Refactor distance spans: add DistanceConstraint, extract OrderedDistanceSpans, and let UnorderedDistanceSpans extend DistanceSpans
diff --git a/src/main/java/de/ids_mannheim/korap/query/DistanceConstraint.java b/src/main/java/de/ids_mannheim/korap/query/DistanceConstraint.java
new file mode 100644
index 0000000..686f3dc
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/query/DistanceConstraint.java
@@ -0,0 +1,46 @@
+package de.ids_mannheim.korap.query;
+
+import org.apache.lucene.search.spans.SpanQuery;
+/** Mutable holder for a distance constraint: a unit name, a min/max distance pair and an optional element query. */
+public class DistanceConstraint {
+ int minDistance, maxDistance; // stored exactly as given; this class performs no min <= max check
+ String unit; // NOTE(review): the accepted unit values are not defined in this class — confirm against callers
+ SpanQuery elementQuery; // stays null unless set by the 4-argument constructor or setElementQuery
+ /** Creates a constraint without an element query. */
+ public DistanceConstraint(String unit, int min, int max) {
+ this.unit = unit;
+ this.minDistance = min;
+ this.maxDistance = max;
+ }
+ /** Creates a constraint that additionally carries an element query. */
+ public DistanceConstraint(SpanQuery elementQuery, String unit,
+ int min, int max) {
+ this(unit, min, max); // delegate the unit/min/max assignment to the 3-argument constructor
+ this.elementQuery = elementQuery;
+ }
+
+ public int getMinDistance() { // plain accessors follow; none of the setters validates its argument
+ return minDistance;
+ }
+ public void setMinDistance(int minDistance) {
+ this.minDistance = minDistance;
+ }
+ public int getMaxDistance() {
+ return maxDistance;
+ }
+ public void setMaxDistance(int maxDistance) {
+ this.maxDistance = maxDistance;
+ }
+ public String getUnit() {
+ return unit;
+ }
+ public void setUnit(String unit) {
+ this.unit = unit;
+ }
+ public SpanQuery getElementQuery() { // may return null when no element query was supplied
+ return elementQuery;
+ }
+ public void setElementQuery(SpanQuery elementQuery) {
+ this.elementQuery = elementQuery;
+ }
+}
diff --git a/src/main/java/de/ids_mannheim/korap/query/SpanDistanceQuery.java b/src/main/java/de/ids_mannheim/korap/query/SpanDistanceQuery.java
index 2a56dec..634a887 100644
--- a/src/main/java/de/ids_mannheim/korap/query/SpanDistanceQuery.java
+++ b/src/main/java/de/ids_mannheim/korap/query/SpanDistanceQuery.java
@@ -30,8 +30,8 @@
public SpanDistanceQuery(SpanQuery firstClause, SpanQuery secondClause,
int minDistance, int maxDistance, boolean isOrdered,
- boolean collectPayloads) {
- super(firstClause, secondClause, "spanDistance");
+ boolean collectPayloads) {
+ super(firstClause, secondClause, "spanDistance");
this.firstClause=firstClause;
this.secondClause=secondClause;
this.minDistance =minDistance;
@@ -42,12 +42,22 @@
public SpanDistanceQuery(SpanQuery elementQuery, SpanQuery firstClause,
SpanQuery secondClause, int minDistance, int maxDistance,
- boolean isOrdered, boolean collectPayloads) {
- this(firstClause, secondClause, minDistance, maxDistance, isOrdered,
- collectPayloads);
+ boolean isOrdered, boolean collectPayloads) {
+ super(firstClause, secondClause, "spanElementDistance");
+ this.firstClause=firstClause;
+ this.secondClause=secondClause;
+ this.minDistance =minDistance;
+ this.maxDistance = maxDistance;
+ this.isOrdered = isOrdered;
+ this.collectPayloads = collectPayloads;
this.elementQuery = elementQuery;
}
+ public SpanDistanceQuery(SpanQuery firstClause2, SpanQuery secondClause2, // forwards both clauses and the String unchanged to the superclass
+ String string) { // third super argument; the other constructors pass "spanDistance" / "spanElementDistance" in this position
+ super(firstClause2, secondClause2, string); // NOTE(review): unlike the other constructors, no field of this class (clauses, distances, flags, elementQuery) is assigned here — confirm this is intended
+ }
+
@Override
public SpanDistanceQuery clone() {
SpanDistanceQuery spanDistanceQuery = new SpanDistanceQuery(
@@ -106,12 +116,10 @@
public void setCollectPayloads(boolean collectPayloads) {
this.collectPayloads = collectPayloads;
}
-
public SpanQuery getElementQuery() {
return elementQuery;
}
-
public void setElementQuery(SpanQuery elementQuery) {
this.elementQuery = elementQuery;
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/DistanceSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/DistanceSpans.java
index d87ead2..6bc0c23 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/DistanceSpans.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/DistanceSpans.java
@@ -8,10 +8,12 @@
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
+import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.util.Bits;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import de.ids_mannheim.korap.query.SimpleSpanQuery;
import de.ids_mannheim.korap.query.SpanDistanceQuery;
/** DistanceSpan is a base class for enumeration of span matches,
@@ -22,120 +24,61 @@
* @author margaretha
* */
public abstract class DistanceSpans extends SimpleSpans{
-
- protected boolean hasMoreFirstSpans;
- protected boolean collectPayloads;
- protected int minDistance,maxDistance;
- protected List<CandidateSpan> candidateList;
- protected int candidateListIndex;
- protected int candidateListDocNum;
-
- private Logger log = LoggerFactory.getLogger(DistanceSpans.class);
+ protected CandidateSpan matchFirstSpan,matchSecondSpan;
+ protected Logger log = LoggerFactory.getLogger(DistanceSpans.class);
public DistanceSpans(SpanDistanceQuery query,
AtomicReaderContext context, Bits acceptDocs,
- Map<Term, TermContext> termContexts)
- throws IOException {
+ Map<Term, TermContext> termContexts) throws IOException {
super(query, context, acceptDocs, termContexts);
-
- minDistance = query.getMinDistance();
- maxDistance = query.getMaxDistance();
- collectPayloads = query.isCollectPayloads();
-
- hasMoreFirstSpans = firstSpans.next();
-
- candidateList = new ArrayList<>();
- candidateListIndex = -1;
- candidateListDocNum = firstSpans.doc();
- }
+ }
@Override
public boolean next() throws IOException {
isStartEnumeration=false;
matchPayload.clear();
return advance();
- }
-
- /** Find a span match in the candidate list.
+ }
+
+
+ /** Find the next span match.
+ * @return true iff a span match is available.
* */
- private boolean advance() throws IOException {
- while( hasMoreSpans && candidateListIndex < candidateList.size() ){
- // Check candidates
- for (candidateListIndex++;candidateListIndex < candidateList.size();
- candidateListIndex++){
- if (findMatch())
- return true;
+ protected abstract boolean advance() throws IOException;
+
+ /** Find the same doc shared by element, firstspan and secondspan.
+ * @return true iff such a doc is found.
+ * */
+ protected boolean findSameDoc(Spans x,
+ Spans y, Spans e) throws IOException{
+
+ while (hasMoreSpans) {
+ if (ensureSameDoc(x, y) &&
+ e.doc() == x.doc()){
+ return true;
}
-
- do { // Forward secondspan
- hasMoreSpans = secondSpans.next();
- setCandidateList();
- }
- while (hasMoreSpans && !isSecondSpanValid());
- }
- return false;
+ if (!ensureSameDoc(e,y)){
+ return false;
+ };
+ }
+ return false;
}
- /** Determine if the current second span is valid. It is always valid in
- * TokenDistanceSpan, but it can be invalid in the ElementDistanceSpan,
- * namely when it is not within a particular element (a sentence or a
- * paragraph depends on the element distance unit).
- *
- * */
- protected abstract boolean isSecondSpanValid() throws IOException;
-
- /** Collect all possible firstspan instances as candidate spans for
- * the current secondspan. The candidate spans are within the max
- * distance from the current secondspan.
- * */
- protected abstract void setCandidateList() throws IOException;
-
- /** Define the conditions for a match.
- * */
- protected abstract boolean findMatch() throws IOException;
-
- /** Define the properties of a span match.
- * */
- protected void setMatchProperties(CandidateSpan candidateSpan,
- boolean isDistanceZero) throws IOException{
-
- if (isDistanceZero){
- matchStartPosition = Math.min(candidateSpan.getStart(), secondSpans.start());
- matchEndPosition = Math.max(candidateSpan.getEnd(), secondSpans.end());
- }
- else {
- matchStartPosition = candidateSpan.getStart();
- matchEndPosition = secondSpans.end();
- }
-
- this.matchDocNumber = secondSpans.doc();
- if (collectPayloads){
- if (candidateSpan.getPayloads() != null) {
- matchPayload.addAll(candidateSpan.getPayloads());
- }
- if (secondSpans.isPayloadAvailable()) {
- matchPayload.addAll(secondSpans.getPayload());
- }
- }
-
- log.trace("doc# {}, start {}, end {}",matchDocNumber,matchStartPosition,
- matchEndPosition);
+ public CandidateSpan getMatchFirstSpan() {
+ return matchFirstSpan;
}
- @Override
- public boolean skipTo(int target) throws IOException {
- if (hasMoreSpans && (secondSpans.doc() < target)){
- if (!secondSpans.skipTo(target)){
- candidateList.clear();
- return false;
- }
- }
-
- setCandidateList();
- matchPayload.clear();
- isStartEnumeration=false;
- return advance();
+ public void setMatchFirstSpan(CandidateSpan matchFirstSpan) {
+ this.matchFirstSpan = matchFirstSpan;
+ }
+
+ public CandidateSpan getMatchSecondSpan() {
+ return matchSecondSpan;
+ }
+
+ public void setMatchSecondSpan(CandidateSpan matchSecondSpan) {
+ this.matchSecondSpan = matchSecondSpan;
}
}
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/ElementDistanceSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/ElementDistanceSpans.java
index e5ec3f6..cd751ab 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/ElementDistanceSpans.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/ElementDistanceSpans.java
@@ -22,7 +22,7 @@
*
* @author margaretha
* */
-public class ElementDistanceSpans extends DistanceSpans {
+public class ElementDistanceSpans extends OrderedDistanceSpans {
private Spans elements;
private boolean hasMoreElements;
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/OrderedDistanceSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/OrderedDistanceSpans.java
new file mode 100644
index 0000000..0d4e459
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/OrderedDistanceSpans.java
@@ -0,0 +1,130 @@
+package de.ids_mannheim.korap.query.spans;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.lucene.index.AtomicReaderContext;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermContext;
+import org.apache.lucene.util.Bits;
+
+import de.ids_mannheim.korap.query.SpanDistanceQuery;
+
+/** Base class for calculating a distance between two ordered spans.
+ * @author margaretha
+ * */
+public abstract class OrderedDistanceSpans extends DistanceSpans {
+
+ protected boolean hasMoreFirstSpans; // initialised from firstSpans.next() in the constructor
+ protected boolean collectPayloads; // copied from the query; gates payload collection in setMatchProperties
+ protected int minDistance,maxDistance; // copied from the query in the constructor
+
+ protected List<CandidateSpan> candidateList; // scanned by advance(); expected to be filled by setCandidateList()
+ protected int candidateListIndex; // cursor into candidateList; starts at -1 and is pre-incremented by advance()
+ protected int candidateListDocNum; // initialised to firstSpans.doc(); not updated anywhere in this class
+
+ /** Copies the distance settings from the query and positions firstSpans on its first span. */
+ public OrderedDistanceSpans(SpanDistanceQuery query,
+ AtomicReaderContext context, Bits acceptDocs,
+ Map<Term, TermContext> termContexts)
+ throws IOException {
+ super(query, context, acceptDocs, termContexts);
+
+ minDistance = query.getMinDistance();
+ maxDistance = query.getMaxDistance();
+ collectPayloads = query.isCollectPayloads();
+
+ hasMoreFirstSpans = firstSpans.next(); // move firstSpans onto its first span, if there is one
+
+ candidateList = new ArrayList<>();
+ candidateListIndex = -1; // advance() increments before looking at a candidate
+ candidateListDocNum = firstSpans.doc(); // NOTE(review): read even when hasMoreFirstSpans is false — confirm doc() is safe on an exhausted Spans
+ }
+
+ /** Find a span match in the candidate list, forwarding the second span whenever the list is used up.
+ * @return true iff findMatch() accepted a candidate. */
+ @Override
+ protected boolean advance() throws IOException {
+ while( hasMoreSpans && candidateListIndex < candidateList.size() ){
+ // Check candidates
+ for (candidateListIndex++;candidateListIndex < candidateList.size(); // resume right after the previously examined candidate
+ candidateListIndex++){
+ if (findMatch())
+ return true;
+ }
+
+ do { // Forward secondspan
+ hasMoreSpans = secondSpans.next();
+ setCandidateList(); // NOTE(review): also invoked when next() has just returned false — implementations must cope with that
+ }
+ while (hasMoreSpans && !isSecondSpanValid());
+ }
+ return false;
+ }
+
+ /** Determine if the current second span is valid. It is always valid in
+ * TokenDistanceSpans, but it can be invalid in ElementDistanceSpans,
+ * namely when it is not within a particular element (a sentence or a
+ * paragraph, depending on the element distance unit).
+ *
+ * */
+ protected abstract boolean isSecondSpanValid() throws IOException;
+
+ /** Collect all possible firstspan instances as candidate spans for
+ * the current secondspan. The candidate spans are within the max
+ * distance from the current secondspan.
+ * */
+ protected abstract void setCandidateList() throws IOException;
+
+ /** Define the conditions for a match.
+ * */
+ protected abstract boolean findMatch() throws IOException;
+
+ /** Define the properties of a span match.
+ * */
+ protected void setMatchProperties(CandidateSpan candidateSpan,
+ boolean isDistanceZero) throws IOException{
+
+ setMatchFirstSpan(candidateSpan); // remember the two spans that form this match
+ setMatchSecondSpan(new CandidateSpan(secondSpans));
+
+ if (isDistanceZero){ // match covers both spans: smallest start to largest end
+ matchStartPosition = Math.min(candidateSpan.getStart(), secondSpans.start());
+ matchEndPosition = Math.max(candidateSpan.getEnd(), secondSpans.end());
+ }
+ else { // match runs from the candidate's start to the second span's end
+ matchStartPosition = candidateSpan.getStart();
+ matchEndPosition = secondSpans.end();
+ }
+
+ this.matchDocNumber = secondSpans.doc(); // the document number is taken from the second span
+ if (collectPayloads){
+ if (candidateSpan.getPayloads() != null) {
+ matchPayload.addAll(candidateSpan.getPayloads());
+ }
+ if (secondSpans.isPayloadAvailable()) {
+ matchPayload.addAll(secondSpans.getPayload());
+ }
+ }
+
+ log.trace("doc# {}, start {}, end {}",matchDocNumber,matchStartPosition,
+ matchEndPosition);
+ }
+
+ /** Skip the second span to the target document if it is still before it, then look for the next match. */
+ @Override
+ public boolean skipTo(int target) throws IOException {
+ if (hasMoreSpans && (secondSpans.doc() < target)){ // only skip when the second span is behind the target
+ if (!secondSpans.skipTo(target)){
+ candidateList.clear();
+ return false; // NOTE(review): hasMoreSpans is left unchanged on this path — confirm later next() calls behave as intended
+ }
+ }
+
+ setCandidateList(); // rebuild the candidates for the (possibly new) second span
+ matchPayload.clear();
+ isStartEnumeration=false;
+ return advance();
+ }
+}
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/SimpleSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/SimpleSpans.java
index a7483aa..0bacd04 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/SimpleSpans.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/SimpleSpans.java
@@ -76,24 +76,6 @@
return true;
}
- /** Find the same doc shared by element, firstspan and secondspan.
- * @return true iff such a doc is found.
- * */
- protected boolean findSameDoc(Spans x,
- Spans y, Spans e) throws IOException{
-
- while (hasMoreSpans) {
- if (ensureSameDoc(x, y) &&
- e.doc() == x.doc()){
- return true;
- }
- if (!ensureSameDoc(e,y)){
- return false;
- };
- }
- return false;
- }
-
@Override
public int doc() {
return matchDocNumber;
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/TokenDistanceSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/TokenDistanceSpans.java
index 586f0e1..f3c4907 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/TokenDistanceSpans.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/TokenDistanceSpans.java
@@ -18,7 +18,7 @@
*
* @author margaretha
* */
-public class TokenDistanceSpans extends DistanceSpans{
+public class TokenDistanceSpans extends OrderedDistanceSpans{
public TokenDistanceSpans(SpanDistanceQuery query,
AtomicReaderContext context, Bits acceptDocs,
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/UnorderedDistanceSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/UnorderedDistanceSpans.java
index 9b80f60..2c29093 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/UnorderedDistanceSpans.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/UnorderedDistanceSpans.java
@@ -12,8 +12,6 @@
import org.apache.lucene.index.TermContext;
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.util.Bits;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
import de.ids_mannheim.korap.query.SpanDistanceQuery;
@@ -22,7 +20,7 @@
*
* @author margaretha
* */
-public abstract class UnorderedDistanceSpans extends SimpleSpans{
+public abstract class UnorderedDistanceSpans extends DistanceSpans{
protected int minDistance, maxDistance;
private boolean collectPayloads;
@@ -31,10 +29,7 @@
protected List<CandidateSpan> firstSpanList, secondSpanList;
protected List<CandidateSpan> matchList;
private long matchCost;
-
- protected int updatedListNum;
-
- private Logger log = LoggerFactory.getLogger(UnorderedDistanceSpans.class);
+ private int matchListSpanNum;
public UnorderedDistanceSpans(SpanDistanceQuery query,
AtomicReaderContext context, Bits acceptDocs,
@@ -54,16 +49,7 @@
}
@Override
- public boolean next() throws IOException {
- isStartEnumeration = false;
- matchPayload.clear();
- return advance();
- }
-
- /** Find the next span match.
- * @return true iff a span match is available.
- * */
- private boolean advance() throws IOException {
+ protected boolean advance() throws IOException {
while (hasMoreSpans || !matchList.isEmpty()){
if (!matchList.isEmpty()){
setMatchProperties();
@@ -108,10 +94,14 @@
if (currentFirstSpan.getEnd() <= currentSecondSpan.getEnd()){
matchList = findMatches(currentFirstSpan, secondSpanList);
- updateList(firstSpanList);
+ setMatchFirstSpan(currentFirstSpan);
+ matchListSpanNum = 2;
+ updateList(firstSpanList);
}
else {
matchList = findMatches(currentSecondSpan, firstSpanList);
+ setMatchSecondSpan(currentSecondSpan);
+ matchListSpanNum = 1;
updateList(secondSpanList);
}
}
@@ -164,7 +154,6 @@
return new CandidateSpan(start,end,doc,cost,payloads);
}
-
/** Assign the first candidate span in the match list as the current span match.
* */
private void setMatchProperties() {
@@ -175,7 +164,11 @@
matchCost = cs.getCost();
matchPayload.addAll(cs.getPayloads());
matchList.remove(0);
-
+
+ if (matchListSpanNum == 1)
+ setMatchFirstSpan(cs);
+ else setMatchSecondSpan(cs);
+
log.trace("Match doc#={} start={} end={}", matchDocNumber,
matchStartPosition,matchEndPosition);
}
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/UnorderedElementDistanceSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/UnorderedElementDistanceSpans.java
index 2483720..21c22e3 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/UnorderedElementDistanceSpans.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/UnorderedElementDistanceSpans.java
@@ -199,8 +199,7 @@
CandidateSpan e;
while(i.hasNext()){
e = i.next();
- if (e.getPosition() < position) {
- //System.out.println("pos "+position+" size "+ elementList.size());
+ if (e.getPosition() <= position) {
i.remove();
}
break;
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/UnorderedTokenDistanceSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/UnorderedTokenDistanceSpans.java
index 8d786d9..f7e6037 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/UnorderedTokenDistanceSpans.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/UnorderedTokenDistanceSpans.java
@@ -2,7 +2,6 @@
import java.io.IOException;
import java.util.ArrayList;
-import java.util.Iterator;
import java.util.List;
import java.util.Map;
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestDistanceIndex.java b/src/test/java/de/ids_mannheim/korap/index/TestDistanceIndex.java
index 44ce601..7200bf3 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestDistanceIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestDistanceIndex.java
@@ -296,8 +296,8 @@
ki.addDoc(createFieldDoc1());
ki.commit();
- SpanQuery firstClause = createQuery("s:c", "s:c", 1, 2,true);
- kr = ki.search(firstClause, (short) 10);
+ SpanQuery sq = createQuery("s:c", "s:c", 1, 2,true);
+ kr = ki.search(sq, (short) 10);
assertEquals(3, kr.totalResults());
assertEquals(0, kr.getMatch(0).startPos);
@@ -306,6 +306,21 @@
assertEquals(4, kr.getMatch(1).endPos);
assertEquals(3, kr.getMatch(2).startPos);
assertEquals(6, kr.getMatch(2).endPos);
- }
+
+ ki.addDoc(createFieldDoc2());
+ ki.commit();
+
+ // with order
+ sq = createQuery("s:e", "s:e", 1, 1,true);
+ kr = ki.search(sq, (short) 10);
+
+ assertEquals(1, kr.totalResults());
+
+ // without order
+ sq = createQuery("s:e", "s:e", 1, 1,false);
+ kr = ki.search(sq, (short) 10);
+
+ assertEquals(2, kr.totalResults());
+ }
}
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestElementDistanceIndex.java b/src/test/java/de/ids_mannheim/korap/index/TestElementDistanceIndex.java
index c6d6277..c855fe5 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestElementDistanceIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestElementDistanceIndex.java
@@ -28,7 +28,7 @@
fd.addString("ID", "doc-0");
fd.addTV("base",
"text",
- "[(0-1)s:b|s:c|_1#0-1|<>:s#0-2$<i>1]" +
+ "[(0-1)s:b|s:c|_1#0-1|<>:s#0-1$<i>1]" +
"[(1-2)s:b|_2#1-2]" +
"[(2-3)s:c|_3#2-3|<>:s#2-3$<i>3]" +
"[(3-4)s:b|_4#3-4|<>:s#3-4$<i>4]" +