Removed candidate buffering in ElementSpans
diff --git a/Changes b/Changes
index 4201e83..de2ac93 100644
--- a/Changes
+++ b/Changes
@@ -1,11 +1,11 @@
-0.49.4 2015-02-04
+0.49.4 2015-02-05
- [documentation] Improved documentation for API classes,
improved test coverage for utility classes (diewald)
- [performance] Updated Lucene dependency from 4.5.1 to 4.10.3,
Updated Jackson dependency from 2.4.0 to 2.4.4,
Updated Jersey dependency from 2.4.1 to 2.15 (diewald)
- [feature] Presorting of element terms in the index for coherent
- SpanQuery sorting (diewald)
+ SpanQuery sorting; Removed buffering of element candidates (diewald)
Warning: This is a breaking change!
- [cleanup] Renamed /filter to /collection,
merge KorapHTML and KorapString (diewald)
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/ElementSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/ElementSpans.java
index a4d7159..c51665e 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/ElementSpans.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/ElementSpans.java
@@ -18,16 +18,13 @@
import de.ids_mannheim.korap.query.SpanElementQuery;
/**
- * Enumeration of spans which are elements such as phrases, sentences and
- * paragraphs.
+ * Enumeration of special spans whose length is stored in their payload,
+ * representing elements such as phrases, sentences and paragraphs.
*
* @author margaretha
* @author diewald
*/
public class ElementSpans extends SpansWithId {
-
- private List<CandidateElementSpan> candidateList;
- private int currentDoc, currentPosition;
private TermSpans termSpans;
/**
@@ -40,156 +37,82 @@
* @throws IOException
*/
public ElementSpans(SpanElementQuery spanElementQuery,
- AtomicReaderContext context, Bits acceptDocs,
- Map<Term, TermContext> termContexts) throws IOException {
+ AtomicReaderContext context, Bits acceptDocs,
+ Map<Term, TermContext> termContexts) throws IOException {
super(spanElementQuery, context, acceptDocs, termContexts);
- candidateList = new ArrayList<>();
- termSpans = (TermSpans) firstSpans;
- hasMoreSpans = termSpans.next();
- if (hasMoreSpans) {
- currentDoc = termSpans.doc();
- currentPosition = termSpans.start();
- }
- }
+ termSpans = (TermSpans) this.firstSpans;
+ hasMoreSpans = true;
+ };
+
@Override
public boolean next() throws IOException {
isStartEnumeration = false;
- return advance();
- }
- /**
- * Advances the ElementSpans to the next match by first checking the
- * candidate match list. If the list is empty, it will be set/filled in
- * first. Tells if there is a next match or not.
- *
- * @return <code>true</code> if a match is found, <code>false</code>
- * otherwise.
- * @throws IOException
- */
- private boolean advance() throws IOException {
- while (hasMoreSpans || !candidateList.isEmpty()) {
- if (!candidateList.isEmpty()) {
- CandidateElementSpan cs = candidateList.get(0);
- this.matchDocNumber = cs.getDoc();
- this.matchStartPosition = cs.getStart();
- this.matchEndPosition = cs.getEnd();
- this.matchPayload = cs.getPayloads();
- // this.setElementRef(cs.getSpanId());
- this.setSpanId(cs.getSpanId());
- candidateList.remove(0);
- return true;
- } else {
- // logger.info("Setting candidate list");
- setCandidateList();
- currentDoc = termSpans.doc();
- currentPosition = termSpans.start();
- }
- }
- return false;
- }
+ if (!hasMoreSpans || !(hasMoreSpans = termSpans.next()))
+ return false;
- /**
- * Collects all the elements starting at the same position and sort them by
- * their end positions. The list starts with the element having the smallest
- * end position.
- *
- * @throws IOException
- */
- private void setCandidateList() throws IOException {
- while (hasMoreSpans && termSpans.doc() == currentDoc
- && termSpans.start() == currentPosition) {
- CandidateElementSpan cs = new CandidateElementSpan(termSpans,
- spanId);
- // elementRef);
- readPayload(cs);
- candidateList.add(cs);
- hasMoreSpans = termSpans.next();
- }
- Collections.sort(candidateList);
- }
+ // Set current values
+ return this.setToCurrent();
+ };
- /**
- * Reads the payloads of the termSpan and sets the end position and element
- * id from the payloads for the candidate match. The payloads for
- * character-offsets are set as the candidate match payloads. <br/>
- * <br/>
- * <em>Note</em>: payloadbuffer should actually collects all other payload
- * beside end position and element id, but KorapIndex identify element's
- * payloads by its length (8), which represents the character offset
- * payloads. So these offsets are directly set as the candidate match
- * payload.
- *
- * @param cs a candidate match
- * @throws IOException
- */
- private void readPayload(CandidateElementSpan cs) throws IOException {
- List<byte[]> payload = (List<byte[]>) termSpans.getPayload();
- int length = payload.get(0).length;
- ByteBuffer bb = ByteBuffer.allocate(length);
- bb.put(payload.get(0));
+
+ // Set term values to current
+ private boolean setToCurrent () throws IOException {
+ // Copy start position and document number from the term span
+ this.matchStartPosition = termSpans.start();
+ this.matchDocNumber = termSpans.doc();
+
+ // No need to check whether a payload exists - there has to be one!
+ this.matchPayload = termSpans.getPayload();
+
+ List<byte[]> payload = (List<byte[]>) this.matchPayload;
if (!payload.isEmpty()) {
- // set element end position from payload
- cs.setEnd(bb.getInt(8));
- if (hasSpanId) { // copy element id
- cs.setSpanId(bb.getShort(12));
- } else { // set element id -1
- cs.setSpanId((short) -1);
- }
+ // Copy the first payload entry into a byte buffer
+ int length = payload.get(0).length;
+ ByteBuffer bb = ByteBuffer.allocate(length);
+ bb.put(payload.get(0));
+
+ // set element end position from payload
+ this.matchEndPosition = bb.getInt(8);
+
+ // Copy element id
+ this.setSpanId(this.hasSpanId ? bb.getShort(12) : (short) -1);
+
// Copy the start and end character offsets
byte[] b = new byte[8];
b = Arrays.copyOfRange(bb.array(), 0, 8);
- cs.setPayloads(Collections.singletonList(b));
- } else {
- cs.setEnd(cs.getStart());
- cs.setSpanId((short) -1);
- cs.setPayloads(null);
+ this.matchPayload = Collections.singletonList(b);
}
- }
+
+ // Empty payload list: the end position falls back to the start position
+ else {
+ this.matchEndPosition = this.matchStartPosition;
+ this.setSpanId((short) -1);
+ this.matchPayload = null;
+ };
+ return true;
+ };
+
@Override
public boolean skipTo(int target) throws IOException {
- if (hasMoreSpans && (firstSpans.doc() < target)) {
- if (!firstSpans.skipTo(target)) {
- candidateList.clear();
- return false;
- }
- }
- setCandidateList();
- matchPayload.clear();
- isStartEnumeration = false;
- return advance();
- }
+ if (hasMoreSpans &&
+ firstSpans.doc() < target &&
+ firstSpans.skipTo(target)) {
+ return this.setToCurrent();
+ };
+
+ hasMoreSpans = false;
+ this.matchPayload = null;
+ return false;
+ };
+
@Override
public long cost() {
return termSpans.cost();
- }
-
- /**
- * Match candidate for element spans.
- *
- * @author margaretha
- *
- */
- class CandidateElementSpan extends CandidateSpan {
-
- private short elementId;
-
- public CandidateElementSpan(Spans span, short elementId)
- throws IOException {
- super(span);
- setSpanId(elementId);
- }
-
- public void setSpanId(short elementId) {
- this.elementId = elementId;
- }
-
- public short getSpanId() {
- return elementId;
- }
- }
+ };
};
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestNextIndex.java b/src/test/java/de/ids_mannheim/korap/index/TestNextIndex.java
index 59b8525..00fa232 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestNextIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestNextIndex.java
@@ -41,30 +41,29 @@
@Test
public void indexExample1 () throws IOException {
- KorapIndex ki = new KorapIndex();
+ KorapIndex ki = new KorapIndex();
- // abcabcabac
- FieldDocument fd = new FieldDocument();
- fd.addTV("base",
- "abcabcabac",
- "[(0-1)s:a|i:a|_0#0-1|-:t$<i>10]" +
- "[(1-2)s:b|i:b|_1#1-2]" +
- "[(2-3)s:c|i:c|_2#2-3]" +
- "[(3-4)s:a|i:a|_3#3-4]" +
- "[(4-5)s:b|i:b|_4#4-5]" +
- "[(5-6)s:c|i:c|_5#5-6]" +
- "[(6-7)s:a|i:a|_6#6-7]" +
- "[(7-8)s:b|i:b|_7#7-8]" +
- "[(8-9)s:a|i:a|_8#8-9]" +
- "[(9-10)s:c|i:c|_9#9-10]");
- ki.addDoc(fd);
+ // abcabcabac
+ FieldDocument fd = new FieldDocument();
+ fd.addTV("base",
+ "abcabcabac",
+ "[(0-1)s:a|i:a|_0#0-1|-:t$<i>10]" +
+ "[(1-2)s:b|i:b|_1#1-2]" +
+ "[(2-3)s:c|i:c|_2#2-3]" +
+ "[(3-4)s:a|i:a|_3#3-4]" +
+ "[(4-5)s:b|i:b|_4#4-5]" +
+ "[(5-6)s:c|i:c|_5#5-6]" +
+ "[(6-7)s:a|i:a|_6#6-7]" +
+ "[(7-8)s:b|i:b|_7#7-8]" +
+ "[(8-9)s:a|i:a|_8#8-9]" +
+ "[(9-10)s:c|i:c|_9#9-10]");
+ ki.addDoc(fd);
+ ki.commit();
- ki.commit();
+ SpanQuery sq;
+ KorapResult kr;
- SpanQuery sq;
- KorapResult kr;
-
- sq = new SpanNextQuery(
+ sq = new SpanNextQuery(
new SpanTermQuery(new Term("base", "s:a")),
new SpanTermQuery(new Term("base", "s:b"))
);
@@ -188,76 +187,74 @@
@Test
public void indexExample4 () throws IOException {
- KorapIndex ki = new KorapIndex();
+ KorapIndex ki = new KorapIndex();
- // abcabcabac
- // abc<x>abc<x>a</x>b</x>ac
- FieldDocument fd = new FieldDocument();
- fd.addString("ID", "doc-1");
- fd.addTV("base",
- "abcabcabac",
- "[(0-1)s:a|i:a|_0#0-1|-:t$<i>10]" +
- "[(1-2)s:b|i:b|_1#1-2]" +
- "[(2-3)s:c|i:c|_2#2-3]" +
- "[(3-4)s:a|i:a|_3#3-4|<>:x#3-7$<i>7]" +
- "[(4-5)s:b|i:b|_4#4-5]" +
- "[(5-6)s:c|i:c|_5#5-6]" +
- "[(6-7)s:a|i:a|_6#6-7]<>:x#6-8$<i>8]" +
- "[(7-8)s:b|i:b|_7#7-8]" +
- "[(8-9)s:a|i:a|_8#8-9]" +
- "[(9-10)s:c|i:c|_9#9-10]");
- ki.addDoc(fd);
+ // abcabcabac
+ // abc<x>abc<x>a</x>b</x>ac
+ FieldDocument fd = new FieldDocument();
+ fd.addString("ID", "doc-1");
+ fd.addTV("base",
+ "abcabcabac",
+ "[(0-1)s:a|i:a|_0#0-1|-:t$<i>10]" +
+ "[(1-2)s:b|i:b|_1#1-2]" +
+ "[(2-3)s:c|i:c|_2#2-3]" +
+ "[(3-4)s:a|i:a|_3#3-4|<>:x#3-7$<i>7]" +
+ "[(4-5)s:b|i:b|_4#4-5]" +
+ "[(5-6)s:c|i:c|_5#5-6]" +
+ "[(6-7)s:a|i:a|_6#6-7]<>:x#6-8$<i>8]" +
+ "[(7-8)s:b|i:b|_7#7-8]" +
+ "[(8-9)s:a|i:a|_8#8-9]" +
+ "[(9-10)s:c|i:c|_9#9-10]");
+ ki.addDoc(fd);
+
+ // xbz<x>xbzx</x>bxz
+ fd = new FieldDocument();
+ fd.addString("ID", "doc-2");
+ fd.addTV("base",
+ "xbzxbzxbxz",
+ "[(0-1)s:x|i:x|_0#0-1|-:t$<i>10]" +
+ "[(1-2)s:b|i:b|_1#1-2]" +
+ "[(2-3)s:z|i:z|_2#2-3]" +
+ "[(3-4)s:x|i:x|_3#3-4|<>:x#3-7$<i>7]" +
+ "[(4-5)s:b|i:b|_4#4-5]" +
+ "[(5-6)s:z|i:z|_5#5-6]" +
+ "[(6-7)s:x|i:x|_6#6-7]" +
+ "[(7-8)s:b|i:b|_7#7-8]" +
+ "[(8-9)s:x|i:x|_8#8-9]" +
+ "[(9-10)s:z|i:z|_9#9-10]");
+ ki.addDoc(fd);
+ ki.commit();
- // xbz<x>xbzx</x>bxz
- fd = new FieldDocument();
- fd.addString("ID", "doc-2");
- fd.addTV("base",
- "xbzxbzxbxz",
- "[(0-1)s:x|i:x|_0#0-1|-:t$<i>10]" +
- "[(1-2)s:b|i:b|_1#1-2]" +
- "[(2-3)s:z|i:z|_2#2-3]" +
- "[(3-4)s:x|i:x|_3#3-4|<>:x#3-7$<i>7]" +
- "[(4-5)s:b|i:b|_4#4-5]" +
- "[(5-6)s:z|i:z|_5#5-6]" +
- "[(6-7)s:x|i:x|_6#6-7]" +
- "[(7-8)s:b|i:b|_7#7-8]" +
- "[(8-9)s:x|i:x|_8#8-9]" +
- "[(9-10)s:z|i:z|_9#9-10]");
- ki.addDoc(fd);
+ SpanQuery sq;
+ KorapResult kr;
-
- ki.commit();
-
- SpanQuery sq;
- KorapResult kr;
-
- sq = new SpanNextQuery(
- new SpanElementQuery("base", "x"),
- new SpanTermQuery(new Term("base", "s:b"))
- );
+ sq = new SpanNextQuery(
+ new SpanElementQuery("base", "x"),
+ new SpanTermQuery(new Term("base", "s:b"))
+ );
- kr = ki.search(sq, (short) 10);
- assertEquals("TotalResults", kr.getTotalResults(), 2);
- assertEquals("abc[abcab]ac", kr.getMatch(0).getSnippetBrackets());
- assertEquals("xbz[xbzxb]xz", kr.getMatch(1).getSnippetBrackets());
+ kr = ki.search(sq, (short) 10);
+ assertEquals("TotalResults", kr.getTotalResults(), 2);
+ assertEquals("abc[abcab]ac", kr.getMatch(0).getSnippetBrackets());
+ assertEquals("xbz[xbzxb]xz", kr.getMatch(1).getSnippetBrackets());
- sq = new SpanNextQuery(
- new SpanTermQuery(new Term("base", "s:c")),
- new SpanElementQuery("base", "x")
- );
-
- kr = ki.search(sq, (short) 10);
- assertEquals(kr.getTotalResults(), 1);
- assertEquals("ab[cabca]bac", kr.getMatch(0).getSnippetBrackets());
-
- sq = new SpanNextQuery(
- new SpanTermQuery(new Term("base", "s:z")),
- new SpanElementQuery("base", "x")
- );
-
- kr = ki.search(sq, (short) 10);
- assertEquals(kr.getTotalResults(), 1);
- assertEquals("xb[zxbzx]bxz", kr.getMatch(0).getSnippetBrackets());
+ sq = new SpanNextQuery(
+ new SpanTermQuery(new Term("base", "s:c")),
+ new SpanElementQuery("base", "x")
+ );
+
+ kr = ki.search(sq, (short) 10);
+ assertEquals(kr.getTotalResults(), 1);
+ assertEquals("ab[cabca]bac", kr.getMatch(0).getSnippetBrackets());
+
+ sq = new SpanNextQuery(
+ new SpanTermQuery(new Term("base", "s:z")),
+ new SpanElementQuery("base", "x")
+ );
+
+ kr = ki.search(sq, (short) 10);
+ assertEquals(1, kr.getTotalResults());
+ assertEquals("xb[zxbzx]bxz", kr.getMatch(0).getSnippetBrackets());
};
/**
diff --git a/src/test/java/de/ids_mannheim/korap/query/TestSpanElementQuery.java b/src/test/java/de/ids_mannheim/korap/query/TestSpanElementQuery.java
index 3581b3b..8fab7b3 100644
--- a/src/test/java/de/ids_mannheim/korap/query/TestSpanElementQuery.java
+++ b/src/test/java/de/ids_mannheim/korap/query/TestSpanElementQuery.java
@@ -11,18 +11,28 @@
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
+/**
+ * @author diewald
+ */
+
@RunWith(JUnit4.class)
public class TestSpanElementQuery {
@Test
public void spanElementQuery () {
- SpanElementQuery sequery = new SpanElementQuery("field", "b");
- assertEquals("<field:b />", sequery.toString());
+ SpanElementQuery sequery = new SpanElementQuery("field", "b");
+ assertEquals("<field:b />", sequery.toString());
};
@Test
public void spanElement2Query () {
- SpanElementQuery sequery = new SpanElementQuery("field", "xyz");
- assertEquals("<field:xyz />", sequery.toString());
+ SpanElementQuery sequery = new SpanElementQuery("field", "xyz");
+ assertEquals("<field:xyz />", sequery.toString());
+ };
+
+ @Test
+ public void spanElement3Query () {
+ SpanElementQuery sequery = new SpanElementQuery("field", "");
+ assertEquals("<field: />", sequery.toString());
};
};