Abstract classes for distance Spans
Token-based distance span
diff --git a/src/main/java/de/ids_mannheim/korap/query/SpanDistanceQuery.java b/src/main/java/de/ids_mannheim/korap/query/SpanDistanceQuery.java
index 8abcb3f..5ce6bb5 100644
--- a/src/main/java/de/ids_mannheim/korap/query/SpanDistanceQuery.java
+++ b/src/main/java/de/ids_mannheim/korap/query/SpanDistanceQuery.java
@@ -10,19 +10,20 @@
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.util.Bits;
-import de.ids_mannheim.korap.query.spans.DistanceSpan;
+import de.ids_mannheim.korap.query.spans.ElementDistanceSpan;
+import de.ids_mannheim.korap.query.spans.TokenDistanceSpan;
/** Match two ordered Spans with minimum and maximum distance constraints.
- * In this query, the distance unit is the difference between two
- * token positions.
+ * The distance unit is either a token (word) or an element, e.g. a sentence or paragraph.
*
* @author margaretha
* */
public class SpanDistanceQuery extends SimpleSpanQuery {
- private int minDistance, maxDistance; // token positions
- private boolean collectPayloads;
- private SpanQuery firstClause, secondClause;
+ protected int minDistance, maxDistance;
+ protected boolean collectPayloads;
+ protected SpanQuery firstClause, secondClause;
+ private SpanQuery elementQuery; // element distance unit
public SpanDistanceQuery(SpanQuery firstClause, SpanQuery secondClause,
int minDistance, int maxDistance, boolean collectPayloads) {
@@ -34,6 +35,15 @@
this.collectPayloads = collectPayloads;
}
+ public SpanDistanceQuery(SpanQuery elementQuery,
+ SpanQuery firstClause, SpanQuery secondClause,
+ int minDistance, int maxDistance,
+ boolean collectPayloads) {
+ this(firstClause, secondClause,minDistance, maxDistance,
+ collectPayloads);
+ this.elementQuery = elementQuery;
+ }
+
@Override
public SpanDistanceQuery clone() {
SpanDistanceQuery spanDistanceQuery = new SpanDistanceQuery(
@@ -43,14 +53,23 @@
this.maxDistance,
this.collectPayloads
);
+
+ if (this.elementQuery != null) {
+ spanDistanceQuery.setElementQuery(this.elementQuery);
+ }
+
spanDistanceQuery.setBoost(getBoost());
return spanDistanceQuery;
}
@Override
public Spans getSpans(AtomicReaderContext context, Bits acceptDocs,
- Map<Term, TermContext> termContexts) throws IOException {
- return new DistanceSpan(this, context, acceptDocs, termContexts);
+ Map<Term, TermContext> termContexts) throws IOException {
+
+ if (this.elementQuery != null)
+ return new ElementDistanceSpan(this, context, acceptDocs, termContexts);
+
+ return new TokenDistanceSpan(this, context, acceptDocs, termContexts);
}
public int getMinDistance() {
@@ -77,4 +96,14 @@
this.collectPayloads = collectPayloads;
}
+
+ public SpanQuery getElementQuery() {
+ return elementQuery;
+ }
+
+
+ public void setElementQuery(SpanQuery elementQuery) {
+ this.elementQuery = elementQuery;
+ }
+
}
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/CandidateSpan.java b/src/main/java/de/ids_mannheim/korap/query/spans/CandidateSpan.java
index dd49894..1db18ef 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/CandidateSpan.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/CandidateSpan.java
@@ -12,6 +12,7 @@
private int doc,start,end;
private long cost;
private Collection<byte[]> payloads;
+ private int position;
public CandidateSpan(Spans span) throws IOException {
this.doc = span.doc();
@@ -25,6 +26,11 @@
else{
this.payloads = null;
}
+ }
+
+ public CandidateSpan(Spans span, int position) throws IOException {
+ this(span);
+ this.position = position;
}
public int getDoc() {
@@ -61,6 +67,14 @@
public void setCost(long cost) {
this.cost = cost;
}
+
+ public int getPosition() {
+ return position;
+ }
+
+ public void setPosition(int position) {
+ this.position = position;
+ }
}
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/DistanceSpan.java b/src/main/java/de/ids_mannheim/korap/query/spans/DistanceSpan.java
index 0ae296f..103fd14 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/DistanceSpan.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/DistanceSpan.java
@@ -14,15 +14,21 @@
import de.ids_mannheim.korap.query.SpanDistanceQuery;
-public class DistanceSpan extends SimpleSpans{
+/** DistanceSpan is the base class for enumerating span matches whose
+ * two child spans lie within a given distance range of each other
+ * (between a min and a max distance).
+ *
+ * @author margaretha
+ * */
+public abstract class DistanceSpan extends SimpleSpans{
- private boolean hasMoreFirstSpans;
- private boolean collectPayloads;
- private int minDistance,maxDistance;
+ protected boolean hasMoreFirstSpans;
+ protected boolean collectPayloads;
+ protected int minDistance,maxDistance;
- private List<CandidateSpan> candidateList;
- private int candidateListIndex;
- private int candidateListDocNum;
+ protected List<CandidateSpan> candidateList;
+ protected int candidateListIndex;
+ protected int candidateListDocNum;
private Logger log = LoggerFactory.getLogger(DistanceSpan.class);
@@ -37,7 +43,6 @@
collectPayloads = query.isCollectPayloads();
hasMoreFirstSpans = firstSpans.next();
- hasMoreSpans = hasMoreFirstSpans;
candidateList = new ArrayList<>();
candidateListIndex = -1;
@@ -51,6 +56,8 @@
return advance();
}
+ /** Find a span match in the candidate list.
+ * */
private boolean advance() throws IOException {
while( hasMoreSpans && candidateListIndex < candidateList.size() ){
// Check candidates
@@ -66,81 +73,43 @@
return false;
}
- private void setCandidateList() throws IOException{
- if (candidateListDocNum == secondSpans.doc()){
- copyPossibleCandidates();
- addNewCandidates();
- candidateListIndex = -1;
- }
- else {
- candidateList.clear();
- if (hasMoreFirstSpans && ensureSameDoc()){
- candidateListDocNum = firstSpans.doc();
- addNewCandidates();
- candidateListIndex = -1;
- }
- }
- }
+ /** Collect all possible firstspan instances as candidate spans for
+ * the current secondspan. The candidate spans are within the max
+ * distance from the current secondspan.
+ *
+ * */
+ protected abstract void setCandidateList() throws IOException;
- private void copyPossibleCandidates(){
- List<CandidateSpan> temp = new ArrayList<>();
- for (CandidateSpan cs : candidateList){
- if (cs.getEnd()+maxDistance > secondSpans.start())
- temp.add(cs);
- }
- candidateList = temp;
- }
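+ /** Check whether the current candidate span and the current secondspan
+ * satisfy the match conditions, which are defined by the subclasses. */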
+ protected abstract boolean findMatch() throws IOException;
- private void addNewCandidates() throws IOException{
- while ( hasMoreFirstSpans &&
- firstSpans.doc() == candidateListDocNum &&
- firstSpans.start() < secondSpans.end()){
-
- if (firstSpans.end()+maxDistance > secondSpans.start())
- candidateList.add(new CandidateSpan(firstSpans));
-
- hasMoreFirstSpans = firstSpans.next();
- }
- }
-
- protected boolean findMatch() throws IOException {
- CandidateSpan candidateSpan = candidateList.get(candidateListIndex);
- if (minDistance == 0 &&
- // intersection
- candidateSpan.getStart() < secondSpans.end() &&
- secondSpans.start() < candidateSpan.getEnd()){
-
+ /** Set the start and end positions, the doc number and (if payloads
+ * are collected) the payloads of the current span match. */
+ protected void setMatchProperties(CandidateSpan candidateSpan,
+ boolean isDistanceZero) throws IOException{
+
+ if (isDistanceZero){
matchStartPosition = Math.min(candidateSpan.getStart(), secondSpans.start());
matchEndPosition = Math.max(candidateSpan.getEnd(), secondSpans.end());
- setDocAndPayload(candidateSpan);
- return true;
}
-
- int actualDistance = secondSpans.start() - candidateSpan.getEnd() +1;
- if (candidateSpan.getStart() < secondSpans.start() &&
- minDistance <= actualDistance &&
- actualDistance <= maxDistance){
-
+ else {
matchStartPosition = candidateSpan.getStart();
matchEndPosition = secondSpans.end();
- setDocAndPayload(candidateSpan);
- return true;
- }
- return false;
- }
-
- private void setDocAndPayload(CandidateSpan candidateSpan) throws IOException{
+ }
+
this.matchDocNumber = secondSpans.doc();
if (collectPayloads){
if (candidateSpan.getPayloads() != null) {
- matchPayload.addAll(candidateSpan.getPayloads());
- log.trace("first",matchPayload.size());
+ matchPayload.addAll(candidateSpan.getPayloads());
}
if (secondSpans.isPayloadAvailable()) {
- matchPayload.addAll(secondSpans.getPayload());
- log.trace("second",matchPayload.size());
+ matchPayload.addAll(secondSpans.getPayload());
}
}
+
+ log.trace("doc# {}, start {}, end {}",matchDocNumber,matchStartPosition,
+ matchEndPosition);
}
@Override
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/NonPartialOverlappingSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/NonPartialOverlappingSpans.java
index 3be205e..e18c04c 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/NonPartialOverlappingSpans.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/NonPartialOverlappingSpans.java
@@ -51,7 +51,7 @@
protected boolean advance() throws IOException {
// The complexity is linear for searching in a document.
// It's better if we can skip to >= position in a document.
- while (hasMoreSpans && ensureSameDoc()){
+ while (hasMoreSpans && ensureSameDoc(firstSpans,secondSpans)){
int matchCase = findMatch();
if (matchCase == 0){
log.trace("Match doc#: {}",matchDocNumber);
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/SimpleSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/SimpleSpans.java
index 239902f..189ee8c 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/SimpleSpans.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/SimpleSpans.java
@@ -50,23 +50,23 @@
isStartEnumeration=true;
}
- /** If the current firstspan and secondspan are not in the same document,
- * try to skip the span with the smaller document number, to the same
- * OR a greater document number than, the document number of the other
- * span. Do this until the firstspan and the secondspan are in the same
- * doc, OR until reaching the last document.
+ /** If the current x and y are not in the same document, skip the
+ * span with the smaller document number to the same OR a greater
+ * document number than the document number of the other span. Do
+ * this until x and y are in the same doc, OR until reaching the
+ * last document.
* @return true iff such a document exists.
* */
- protected boolean ensureSameDoc() throws IOException {
- while (firstSpans.doc() != secondSpans.doc()) {
- if (firstSpans.doc() < secondSpans.doc()){
- if (!firstSpans.skipTo(secondSpans.doc())){
+ protected boolean ensureSameDoc(Spans x, Spans y) throws IOException {
+ while (x.doc() != y.doc()) {
+ if (x.doc() < y.doc()){
+ if (!x.skipTo(y.doc())){
hasMoreSpans = false;
return false;
}
}
else {
- if (!secondSpans.skipTo(firstSpans.doc())){
+ if (!y.skipTo(x.doc())){
hasMoreSpans = false;
return false;
}
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/TokenDistanceSpan.java b/src/main/java/de/ids_mannheim/korap/query/spans/TokenDistanceSpan.java
new file mode 100644
index 0000000..5990029
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/TokenDistanceSpan.java
@@ -0,0 +1,96 @@
+package de.ids_mannheim.korap.query.spans;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.lucene.index.AtomicReaderContext;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermContext;
+import org.apache.lucene.util.Bits;
+
+import de.ids_mannheim.korap.query.SpanDistanceQuery;
+
+/** Enumeration of token-based distance span matches.
+ * Each match consists of two specified spans having an actual distance
+ * in the range of the min and max distance parameters given in the query.
+ *
+ * @author margaretha
+ * */
+public class TokenDistanceSpan extends DistanceSpan{
+
+ public TokenDistanceSpan(SpanDistanceQuery query,
+ AtomicReaderContext context, Bits acceptDocs,
+ Map<Term, TermContext> termContexts) throws IOException {
+ super(query, context, acceptDocs, termContexts);
+ hasMoreSpans = hasMoreFirstSpans;
+ }
+
+ @Override
+ protected void setCandidateList() throws IOException{
+ if (candidateListDocNum == secondSpans.doc()){
+ copyPossibleCandidates();
+ addNewCandidates();
+ candidateListIndex = -1;
+ }
+ else {
+ candidateList.clear();
+ if (hasMoreFirstSpans && ensureSameDoc(firstSpans,secondSpans)){
+ candidateListDocNum = firstSpans.doc();
+ addNewCandidates();
+ candidateListIndex = -1;
+ }
+ }
+ }
+
+ /** Keep only the candidate spans, collected for the previous secondspan,
+ * that can still match: end + maxDistance > current secondspan start.
+ * */
+ private void copyPossibleCandidates(){
+ List<CandidateSpan> temp = new ArrayList<>();
+ for (CandidateSpan cs : candidateList){
+ if (cs.getEnd()+maxDistance > secondSpans.start())
+ temp.add(cs);
+ }
+ candidateList = temp;
+ }
+
+ /** Advance firstspans while they start before the current secondspan end,
+ * adding as candidates those with end + maxDistance > secondspan start. */
+ private void addNewCandidates() throws IOException{
+ while ( hasMoreFirstSpans &&
+ firstSpans.doc() == candidateListDocNum &&
+ firstSpans.start() < secondSpans.end()){
+
+ if (firstSpans.end()+maxDistance > secondSpans.start())
+ candidateList.add(new CandidateSpan(firstSpans));
+
+ hasMoreFirstSpans = firstSpans.next();
+ }
+ }
+
+ @Override
+ protected boolean findMatch() throws IOException {
+ CandidateSpan candidateSpan = candidateList.get(candidateListIndex);
+ if (minDistance == 0 &&
+ // intersection
+ candidateSpan.getStart() < secondSpans.end() &&
+ secondSpans.start() < candidateSpan.getEnd()){
+
+ setMatchProperties(candidateSpan, true);
+ return true;
+ }
+
+ int actualDistance = secondSpans.start() - candidateSpan.getEnd() +1;
+ if (candidateSpan.getStart() < secondSpans.start() &&
+ minDistance <= actualDistance &&
+ actualDistance <= maxDistance){
+
+ setMatchProperties(candidateSpan, false);
+ return true;
+ }
+ return false;
+ }
+
+}