Update abstract classes for span queries; DistanceSpan now extends SimpleSpans
diff --git a/src/main/java/de/ids_mannheim/korap/query/SpanDistanceQuery.java b/src/main/java/de/ids_mannheim/korap/query/SpanDistanceQuery.java
index 8b1e8a4..8abcb3f 100644
--- a/src/main/java/de/ids_mannheim/korap/query/SpanDistanceQuery.java
+++ b/src/main/java/de/ids_mannheim/korap/query/SpanDistanceQuery.java
@@ -37,12 +37,12 @@
@Override
public SpanDistanceQuery clone() {
SpanDistanceQuery spanDistanceQuery = new SpanDistanceQuery(
- (SpanQuery) firstClause.clone(),
- (SpanQuery) secondClause.clone(),
- this.minDistance,
- this.maxDistance,
- this.collectPayloads
- );
+ (SpanQuery) firstClause.clone(),
+ (SpanQuery) secondClause.clone(),
+ this.minDistance,
+ this.maxDistance,
+ this.collectPayloads
+ );
spanDistanceQuery.setBoost(getBoost());
return spanDistanceQuery;
}
@@ -50,8 +50,31 @@
@Override
public Spans getSpans(AtomicReaderContext context, Bits acceptDocs,
Map<Term, TermContext> termContexts) throws IOException {
- return new DistanceSpan(this, context, acceptDocs, termContexts,
- minDistance, maxDistance);
+ return new DistanceSpan(this, context, acceptDocs, termContexts);
+ }
+
+ public int getMinDistance() {
+ return minDistance;
+ }
+
+ public void setMinDistance(int minDistance) {
+ this.minDistance = minDistance;
+ }
+
+ public int getMaxDistance() {
+ return maxDistance;
+ }
+
+ public void setMaxDistance(int maxDistance) {
+ this.maxDistance = maxDistance;
+ }
+
+ public boolean isCollectPayloads() {
+ return collectPayloads;
+ }
+
+ public void setCollectPayloads(boolean collectPayloads) {
+ this.collectPayloads = collectPayloads;
}
}
diff --git a/src/main/java/de/ids_mannheim/korap/query/SpanNextQuery.java b/src/main/java/de/ids_mannheim/korap/query/SpanNextQuery.java
index 32a90d9..f7a9e19 100644
--- a/src/main/java/de/ids_mannheim/korap/query/SpanNextQuery.java
+++ b/src/main/java/de/ids_mannheim/korap/query/SpanNextQuery.java
@@ -7,29 +7,17 @@
*/
import java.io.IOException;
-
-import java.util.List;
-import java.util.ArrayList;
-import java.util.Iterator;
import java.util.Map;
-import java.util.Set;
import org.apache.lucene.index.AtomicReaderContext;
-import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
-import org.apache.lucene.search.Query;
-import org.apache.lucene.util.Bits;
-import org.apache.lucene.util.ToStringUtils;
import org.apache.lucene.search.spans.SpanQuery;
-import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.search.spans.Spans;
+import org.apache.lucene.util.Bits;
import de.ids_mannheim.korap.query.spans.NextSpans;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
/** Matches spans which are directly next to each other.
* This is identical to a phrase query with exactly two clauses.
*/
@@ -57,7 +45,7 @@
public Spans getSpans (final AtomicReaderContext context, Bits acceptDocs,
Map<Term,TermContext> termContexts) throws IOException {
return (Spans) new NextSpans (this, context, acceptDocs,
- termContexts, collectPayloads);
+ termContexts);
};
@Override
diff --git a/src/main/java/de/ids_mannheim/korap/query/SpanSegmentQuery.java b/src/main/java/de/ids_mannheim/korap/query/SpanSegmentQuery.java
index 27e9a6e..3f7e487 100644
--- a/src/main/java/de/ids_mannheim/korap/query/SpanSegmentQuery.java
+++ b/src/main/java/de/ids_mannheim/korap/query/SpanSegmentQuery.java
@@ -38,7 +38,7 @@
public Spans getSpans(AtomicReaderContext context, Bits acceptDocs,
Map<Term, TermContext> termContexts) throws IOException {
return (Spans) new SegmentSpans(this, context, acceptDocs,
- termContexts, collectPayloads);
+ termContexts);
}
@Override
@@ -72,7 +72,7 @@
public int hashCode() {
int result;
result = firstClause.hashCode() + secondClause.hashCode();
- result ^= (31 << result) | (result >>> 2);
+ result ^= (31 * result) + (result >>> 3);
result += Float.floatToRawIntBits(getBoost());
return result;
};
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/DistanceSpan.java b/src/main/java/de/ids_mannheim/korap/query/spans/DistanceSpan.java
index 022f5d4..0ae296f 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/DistanceSpan.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/DistanceSpan.java
@@ -2,73 +2,39 @@
import java.io.IOException;
import java.util.ArrayList;
-import java.util.Collection;
-import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
-import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.util.Bits;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import de.ids_mannheim.korap.query.SimpleSpanQuery;
import de.ids_mannheim.korap.query.SpanDistanceQuery;
-public class DistanceSpan extends Spans{
-
- private boolean isStartEnumeration;
- private boolean hasMoreSpans;
+public class DistanceSpan extends SimpleSpans{
+
private boolean hasMoreFirstSpans;
private boolean collectPayloads;
private int minDistance,maxDistance;
-
- protected int doc, start, end;
- private List<byte[]> payload;
-
- private SpanDistanceQuery query;
- protected Spans firstSpans, secondSpans;
+
private List<CandidateSpan> candidateList;
private int candidateListIndex;
private int candidateListDocNum;
- private Logger log = LoggerFactory.getLogger(SimpleSpans.class);
+ private Logger log = LoggerFactory.getLogger(DistanceSpan.class);
- public DistanceSpan(SpanDistanceQuery spanDistanceQuery,
+ public DistanceSpan(SpanDistanceQuery query,
AtomicReaderContext context, Bits acceptDocs,
- Map<Term, TermContext> termContexts,
- int minDistance,
- int maxDistance)
- throws IOException {
- this(spanDistanceQuery, context, acceptDocs, termContexts,
- minDistance, maxDistance, true);
- }
-
- public DistanceSpan(SpanDistanceQuery spanDistanceQuery,
- AtomicReaderContext context, Bits acceptDocs,
- Map<Term, TermContext> termContexts,
- int minDistance,
- int maxDistance,
- boolean collectPayloads)
- throws IOException {
+ Map<Term, TermContext> termContexts)
+ throws IOException {
+ super(query, context, acceptDocs, termContexts);
- this.query = spanDistanceQuery;
- this.minDistance = minDistance;
- this.maxDistance = maxDistance;
- this.collectPayloads = collectPayloads; // TODO: always true ?
- this.payload = new LinkedList<byte[]>();
- this.doc = -1;
- this.start = -1;
- this.end = -1;
-
- // Get the enumeration of the two spans to match
- firstSpans = spanDistanceQuery.getFirstClause().
- getSpans(context, acceptDocs, termContexts);
- secondSpans = spanDistanceQuery.getSecondClause().
- getSpans(context, acceptDocs, termContexts);
+ minDistance = query.getMinDistance();
+ maxDistance = query.getMaxDistance();
+ collectPayloads = query.isCollectPayloads();
hasMoreFirstSpans = firstSpans.next();
hasMoreSpans = hasMoreFirstSpans;
@@ -76,14 +42,12 @@
candidateList = new ArrayList<>();
candidateListIndex = -1;
candidateListDocNum = firstSpans.doc();
-
- isStartEnumeration=true;
}
@Override
public boolean next() throws IOException {
isStartEnumeration=false;
- payload.clear();
+ matchPayload.clear();
return advance();
}
@@ -138,30 +102,6 @@
hasMoreFirstSpans = firstSpans.next();
}
}
-
-
-
- /** Skip the current first or second span until both the spans are in the same doc.
- * @return true iff the first and second spans are in the same doc.
- * */
- private boolean ensureSameDoc() throws IOException {
- while (firstSpans.doc() != secondSpans.doc()) {
- if (firstSpans.doc() < secondSpans.doc()){
- if (!firstSpans.skipTo(secondSpans.doc())){
- hasMoreSpans = false;
- return false;
- }
- }
- else {
- if (!secondSpans.skipTo(firstSpans.doc())){
- hasMoreSpans = false;
- return false;
- }
- }
- }
- return true;
- }
-
protected boolean findMatch() throws IOException {
CandidateSpan candidateSpan = candidateList.get(candidateListIndex);
@@ -170,8 +110,8 @@
candidateSpan.getStart() < secondSpans.end() &&
secondSpans.start() < candidateSpan.getEnd()){
- this.start = Math.min(candidateSpan.getStart(), secondSpans.start());
- this.end = Math.max(candidateSpan.getEnd(), secondSpans.end());
+ matchStartPosition = Math.min(candidateSpan.getStart(), secondSpans.start());
+ matchEndPosition = Math.max(candidateSpan.getEnd(), secondSpans.end());
setDocAndPayload(candidateSpan);
return true;
}
@@ -181,8 +121,8 @@
minDistance <= actualDistance &&
actualDistance <= maxDistance){
- this.start = candidateSpan.getStart();
- this.end = secondSpans.end();
+ matchStartPosition = candidateSpan.getStart();
+ matchEndPosition = secondSpans.end();
setDocAndPayload(candidateSpan);
return true;
}
@@ -190,15 +130,15 @@
}
private void setDocAndPayload(CandidateSpan candidateSpan) throws IOException{
- this.doc = secondSpans.doc();
+ this.matchDocNumber = secondSpans.doc();
if (collectPayloads){
if (candidateSpan.getPayloads() != null) {
- payload.addAll(candidateSpan.getPayloads());
- log.trace("first",payload.size());
+ matchPayload.addAll(candidateSpan.getPayloads());
+ log.trace("first",matchPayload.size());
}
if (secondSpans.isPayloadAvailable()) {
- payload.addAll(secondSpans.getPayload());
- log.trace("second",payload.size());
+ matchPayload.addAll(secondSpans.getPayload());
+ log.trace("second",matchPayload.size());
}
}
}
@@ -213,37 +153,12 @@
}
setCandidateList();
- payload.clear();
+ matchPayload.clear();
isStartEnumeration=false;
return advance();
}
@Override
- public int doc() {
- return this.doc;
- }
-
- @Override
- public int start() {
- return this.start;
- }
-
- @Override
- public int end() {
- return this.end;
- }
-
- @Override
- public Collection<byte[]> getPayload() throws IOException {
- return this.payload;
- }
-
- @Override
- public boolean isPayloadAvailable() throws IOException {
- return !this.payload.isEmpty();
- }
-
- @Override
public long cost() {
CandidateSpan candidateSpan = candidateList.get(candidateListIndex);
return candidateSpan.getCost() + secondSpans.cost();
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/NextSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/NextSpans.java
index e594c7f..278f050 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/NextSpans.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/NextSpans.java
@@ -13,30 +13,24 @@
import de.ids_mannheim.korap.query.SimpleSpanQuery;
/** NextSpans is an enumeration of Span matches, which ensures that
- * a span is immediately followed by another span.
- *
+ * a span is immediately followed by another span.
+ *
+ * TODO: NextSpans needs collectPayloads to be explicitly set to true. Why?
* @author margaretha
* */
-public class NextSpans extends SimpleSpans {
+public class NextSpans extends NonPartialOverlappingSpans {
public NextSpans (SimpleSpanQuery simpleSpanQuery,
AtomicReaderContext context,
Bits acceptDocs,
Map<Term,TermContext> termContexts) throws IOException {
- this(simpleSpanQuery, context, acceptDocs, termContexts, true);
+ super(simpleSpanQuery, context, acceptDocs, termContexts);
}
-
- public NextSpans (SimpleSpanQuery simpleSpanQuery,
- AtomicReaderContext context,
- Bits acceptDocs,
- Map<Term,TermContext> termContexts,
- boolean collectPayloads) throws IOException {
- super(simpleSpanQuery, context, acceptDocs, termContexts,collectPayloads);
- }
/** Check weather the end position of the current firstspan equals
* the start position of the secondspan.
* */
+ @Override
protected int findMatch() {
if (firstSpans.end() == secondSpans.start()) {
matchDocNumber = firstSpans.doc();
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/NonPartialOverlappingSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/NonPartialOverlappingSpans.java
new file mode 100644
index 0000000..3be205e
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/NonPartialOverlappingSpans.java
@@ -0,0 +1,113 @@
+package de.ids_mannheim.korap.query.spans;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Map;
+
+import org.apache.lucene.index.AtomicReaderContext;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermContext;
+import org.apache.lucene.util.Bits;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import de.ids_mannheim.korap.query.SimpleSpanQuery;
+
+/** An abstract class for Span enumeration whose two child spans are matched by
+ * their positions and do not have a partial overlap.
+ *
+ * @author margaretha
+ *
+ * */
+public abstract class NonPartialOverlappingSpans extends SimpleSpans{
+
+ private Logger log = LoggerFactory.getLogger(NonPartialOverlappingSpans.class);
+ protected boolean collectPayloads;
+
+ public NonPartialOverlappingSpans(SimpleSpanQuery simpleSpanQuery,
+ AtomicReaderContext context, Bits acceptDocs,
+ Map<Term,TermContext> termContexts) throws IOException {
+ super(simpleSpanQuery, context, acceptDocs, termContexts);
+
+ // Warning: not implemented, results in errors for SpanNextQuery
+ // this.collectPayloads = simpleSpanQuery.isCollectPayloads()
+ this.collectPayloads = true;
+ hasMoreSpans = secondSpans.next();
+
+ }
+
+ @Override
+ public boolean next() throws IOException {
+ // Warning: this does not work for overlapping spans
+ // e.g. get multiple second spans in a firstspan
+ hasMoreSpans &= firstSpans.next();
+ isStartEnumeration=false;
+ matchPayload.clear();
+ return advance();
+ }
+
+ /** "Advance" is the Lucene term for searching for the next match.
+ * */
+ protected boolean advance() throws IOException {
+ // The complexity is linear for searching in a document.
+ // It's better if we can skip to >= position in a document.
+ while (hasMoreSpans && ensureSameDoc()){
+ int matchCase = findMatch();
+ if (matchCase == 0){
+ log.trace("Match doc#: {}",matchDocNumber);
+ log.trace("Match positions: {}-{}", matchStartPosition,
+ matchEndPosition);
+ doCollectPayloads();
+ return true;
+ }
+ else if (matchCase == 1) {
+ hasMoreSpans = secondSpans.next();
+ }
+ else{
+ hasMoreSpans = firstSpans.next();
+ }
+ }
+ return false;
+ }
+
+ /** Specify the condition for a match
+ * @return 0 iff match is found,
+ * -1 to advance the firstspan,
+ * 1 to advance the secondspan
+ * */
+ protected abstract int findMatch();
+
+ /** Collects the available payloads from the current first and second spans. */
+ private void doCollectPayloads() throws IOException {
+ if (collectPayloads){
+ log.trace("Collect payloads");
+ if (firstSpans.isPayloadAvailable()) {
+ Collection<byte[]> payload = firstSpans.getPayload();
+ log.trace("Found {} payloads in firstSpans", payload.size());
+ matchPayload.addAll(payload);
+ }
+ if (secondSpans.isPayloadAvailable()) {
+ Collection<byte[]> payload = secondSpans.getPayload();
+ log.trace("Found {} payloads in secondSpans", payload.size());
+ matchPayload.addAll(payload);
+ }
+ }
+ }
+
+ @Override
+ public boolean skipTo(int target) throws IOException {
+ if (hasMoreSpans && (firstSpans.doc() < target)){
+ if (!firstSpans.skipTo(target)){
+ hasMoreSpans = false;
+ return false;
+ }
+ }
+ matchPayload.clear();
+ return advance();
+ }
+
+ @Override
+ public long cost() {
+ return firstSpans.cost() + secondSpans.cost();
+ }
+}
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/SegmentSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/SegmentSpans.java
index 7373d15..136bdd7 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/SegmentSpans.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/SegmentSpans.java
@@ -8,34 +8,27 @@
import org.apache.lucene.index.TermContext;
import org.apache.lucene.util.Bits;
-import de.ids_mannheim.korap.query.SimpleSpanQuery;
import de.ids_mannheim.korap.query.SpanSegmentQuery;
/** SegmentSpans is an enumeration of Span matches, which ensures that two spans
- * have exactly the same start and end positions.
+ * have exactly the same start and end positions. It also represents the span
+ * match object. This is not very neat, but that is Lucene's design.
*
* @author margaretha
* */
-public class SegmentSpans extends SimpleSpans {
+public class SegmentSpans extends NonPartialOverlappingSpans {
public SegmentSpans (SpanSegmentQuery spanSegmentQuery,
AtomicReaderContext context,
Bits acceptDocs,
Map<Term,TermContext> termContexts) throws IOException {
- this(spanSegmentQuery, context, acceptDocs, termContexts, true);
+ super(spanSegmentQuery, context, acceptDocs, termContexts);
}
-
- public SegmentSpans (SpanSegmentQuery spanSegmentQuery,
- AtomicReaderContext context,
- Bits acceptDocs,
- Map<Term,TermContext> termContexts,
- boolean collectPayloads) throws IOException {
- super(spanSegmentQuery, context, acceptDocs, termContexts,collectPayloads);
- }
/** Check weather the start and end positions of the current
* firstspan and secondspan are identical.
* */
+ @Override
protected int findMatch() {
if (firstSpans.start() == secondSpans.start() &&
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/SimpleSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/SimpleSpans.java
index 0db257d..239902f 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/SimpleSpans.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/SimpleSpans.java
@@ -9,46 +9,36 @@
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
-import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.util.Bits;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
import de.ids_mannheim.korap.query.SimpleSpanQuery;
-/** An abstract class for Span enumeration whose two child spans are matched by
- * their positions and do not have a partial overlap.
+/** An abstract class for Span enumeration including span match properties
+ * and basic methods.
*
* @author margaretha
*
* */
public abstract class SimpleSpans extends Spans{
- private boolean isStartEnumeration;
- private boolean hasMoreSpans;
+ protected boolean isStartEnumeration;
+ protected boolean hasMoreSpans;
protected int matchDocNumber, matchStartPosition, matchEndPosition;
- private List<byte[]> matchPayload;
- private boolean collectPayloads;
+ protected List<byte[]> matchPayload;
// Warning: enumeration of Spans
protected Spans firstSpans, secondSpans;
- private SimpleSpanQuery query;
-
- private Logger log = LoggerFactory.getLogger(SimpleSpans.class);
+ private SimpleSpanQuery query;
public SimpleSpans (SimpleSpanQuery simpleSpanQuery,
- AtomicReaderContext context,
- Bits acceptDocs,
- Map<Term,TermContext> termContexts,
- boolean collectPayloads) throws IOException {
-
- // Initialize as invalid
+ AtomicReaderContext context,
+ Bits acceptDocs,
+ Map<Term,TermContext> termContexts) throws IOException {
+
+ query = simpleSpanQuery;
matchDocNumber= -1;
matchStartPosition= -1;
matchEndPosition= -1;
-
- // TODO: what is this
- this.collectPayloads = true;
matchPayload = new LinkedList<byte[]>();
// Get the enumeration of the two spans to match
@@ -56,53 +46,9 @@
getSpans(context, acceptDocs, termContexts);
secondSpans = simpleSpanQuery.getSecondClause().
getSpans(context, acceptDocs, termContexts);
-
- query = simpleSpanQuery;
- hasMoreSpans = secondSpans.next();
+
isStartEnumeration=true;
}
-
- @Override
- public boolean next() throws IOException {
- // Warning: this does not work for overlapping spans
- // e.g. get multiple second spans in a firstspan
- hasMoreSpans &= firstSpans.next();
- isStartEnumeration=false;
- matchPayload.clear();
- return advance();
- }
-
- /** Advance is a lucene terminology to search for the next match.
- * */
- private boolean advance() throws IOException {
- // The complexity is linear for searching in a document.
- // It's better if we can skip to >= position in a document.
- while (hasMoreSpans && ensureSameDoc()){
- int matchCase = findMatch();
- if (matchCase == 0){
- log.trace("Match doc#: {}",matchDocNumber);
- log.trace("Match positions: {}-{}", matchStartPosition,
- matchEndPosition);
- doCollectPayloads();
- return true;
- }
- else if (matchCase == 1) {
- hasMoreSpans = secondSpans.next();
- }
- else{
- hasMoreSpans = firstSpans.next();
- }
- }
- return false;
- }
-
- /** Specify the condition for a match
- * @return 0 iff match is found,
- * -1 to advance the firstspan,
- * 1 to advance the secondspan
- * */
- protected abstract int findMatch();
-
/** If the current firstspan and secondspan are not in the same document,
* try to skip the span with the smaller document number, to the same
@@ -111,7 +57,7 @@
* doc, OR until reaching the last document.
* @return true iff such a document exists.
* */
- private boolean ensureSameDoc() throws IOException {
+ protected boolean ensureSameDoc() throws IOException {
while (firstSpans.doc() != secondSpans.doc()) {
if (firstSpans.doc() < secondSpans.doc()){
if (!firstSpans.skipTo(secondSpans.doc())){
@@ -127,36 +73,7 @@
}
}
return true;
- }
-
- /** Collecting available payloads from the current first and second spans */
- private void doCollectPayloads() throws IOException {
- if (collectPayloads){
- log.trace("Collect payloads");
- if (firstSpans.isPayloadAvailable()) {
- Collection<byte[]> payload = firstSpans.getPayload();
- log.trace("Found {} payloads in firstSpans", payload.size());
- matchPayload.addAll(payload);
- }
- if (secondSpans.isPayloadAvailable()) {
- Collection<byte[]> payload = secondSpans.getPayload();
- log.trace("Found {} payloads in secondSpans", payload.size());
- matchPayload.addAll(payload);
- }
- }
- }
-
- @Override
- public boolean skipTo(int target) throws IOException {
- if (hasMoreSpans && (firstSpans.doc() < target)){
- if (!firstSpans.skipTo(target)){
- hasMoreSpans = false;
- return false;
- }
- }
- matchPayload.clear();
- return advance();
- }
+ }
@Override
public int doc() {
@@ -182,11 +99,6 @@
public boolean isPayloadAvailable() throws IOException {
return !matchPayload.isEmpty();
}
-
- @Override
- public long cost() {
- return firstSpans.cost() + secondSpans.cost();
- }
@Override
public String toString() { // who does call this?