Add quantifier query,
update NextSpans to return multiple matches at the same firstspan position,
test queries with the WPD corpus
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/CandidateSpan.java b/src/main/java/de/ids_mannheim/korap/query/spans/CandidateSpan.java
index 8f4273a..41ba103 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/CandidateSpan.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/CandidateSpan.java
@@ -1,6 +1,7 @@
package de.ids_mannheim.korap.query.spans;
import java.io.IOException;
+import java.util.ArrayList;
import java.util.Collection;
import org.apache.lucene.search.spans.Spans;
@@ -11,7 +12,7 @@
public class CandidateSpan {
private int doc,start,end;
private long cost;
- private Collection<byte[]> payloads;
+ private Collection<byte[]> payloads = new ArrayList<>();
private int position;
private CandidateSpan childSpan; // used for multiple distance with unordered constraint
@@ -19,14 +20,9 @@
this.doc = span.doc();
this.start = span.start();
this.end = span.end();
- this.cost = span.cost();
-
- if (span.isPayloadAvailable()){
- this.payloads = span.getPayload();
- }
- else{
- this.payloads = null;
- }
+ this.cost = span.cost();
+ if (span.isPayloadAvailable())
+ setPayloads(span.getPayload());
}
public CandidateSpan(Spans span, int position) throws IOException {
@@ -40,7 +36,7 @@
this.end = end;
this.doc = doc;
this.cost = cost;
- this.payloads = payloads;
+ if (payloads != null) setPayloads(payloads);
}
public int getDoc() {
@@ -67,7 +63,17 @@
}
public void setPayloads(Collection<byte[]> payloads) {
- this.payloads = payloads;
+        // Replace (do not append to) the stored payloads with copies of the
+        // given byte arrays; null means "no payloads". The copies go into a
+        // fresh list that is swapped in at the end, so passing in the
+        // collection currently held by this span cannot lose entries.
+        Collection<byte[]> copies = new ArrayList<>();
+        if (payloads != null) {
+            for (byte[] b : payloads) {
+                copies.add(b.clone());
+            }
+        }
+        this.payloads = copies;
}
public long getCost() {
@@ -93,6 +99,4 @@
public void setChildSpan(CandidateSpan childSpan) {
this.childSpan = childSpan;
}
-
-
}
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/NextSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/NextSpans.java
index 6d189a5..02dfdcf 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/NextSpans.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/NextSpans.java
@@ -1,47 +1,209 @@
package de.ids_mannheim.korap.query.spans;
import java.io.IOException;
-
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
import java.util.Map;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.util.Bits;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
-
-import de.ids_mannheim.korap.query.SimpleSpanQuery;
+import de.ids_mannheim.korap.query.SpanNextQuery;
/** NextSpans is an enumeration of Span matches, which ensures that
* a span is immediately followed by another span.
*
- * TODO: nextSpans needs collectPayloads to be explicitly set true. Why?
- * ndiewald: They don't have to be set explicitely - just make them use it always
+ * Several secondspans may start at the end of the same firstspan; each of
+ * them is returned as a separate match.
+ *
* @author margaretha
* */
-public class NextSpans extends NonPartialOverlappingSpans {
+public class NextSpans extends SimpleSpans {
- public NextSpans (SimpleSpanQuery simpleSpanQuery,
+    // Matches found for the current firstspan that were not returned yet.
+    private List<CandidateSpan> matchList;
+    // Secondspans starting at the end of the last examined firstspan. They
+    // are kept because the next firstspan may end at the same position.
+    private List<CandidateSpan> candidateList;
+    // Document number the spans in candidateList belong to.
+    private int candidateListDocNum;
+    private boolean hasMoreFirstSpan;
+
+    private static final Logger log = LoggerFactory.getLogger(NextSpans.class);
+
+    /**
+     * Positions the secondspans on their first span. The firstspans are not
+     * forwarded here; advance() does that.
+     */
+    public NextSpans (SpanNextQuery spanNextQuery,
AtomicReaderContext context,
Bits acceptDocs,
Map<Term,TermContext> termContexts) throws IOException {
- super(simpleSpanQuery, context, acceptDocs, termContexts);
- };
+        super(spanNextQuery, context, acceptDocs, termContexts);
+        collectPayloads = spanNextQuery.isCollectPayloads();
+        hasMoreSpans = secondSpans.next();
+        matchList = new ArrayList<>();
+        candidateList = new ArrayList<>();
+        candidateListDocNum = firstSpans.doc();
+    }
- /** Check weather the end position of the current firstspan equals
- * the start position of the secondspan.
- **/
- @Override
- protected int findMatch() {
- if (firstSpans.end() == secondSpans.start()) {
- matchDocNumber = firstSpans.doc();
- matchStartPosition = firstSpans.start();
- matchEndPosition = secondSpans.end();
- return 0;
- }
- else if (firstSpans.end() > secondSpans.start())
- return 1;
+    /** Moves the enumeration to the next match. */
+    @Override
+    public boolean next() throws IOException {
+        isStartEnumeration = false;
+        matchPayload.clear();
+        return advance();
+    }
- return -1;
- };
+
+    /**
+     * Sets the next pending match, forwarding the firstspans whenever no
+     * match is pending.
+     *
+     * @return true if a match was set, false if there is none left
+     */
+    private boolean advance() throws IOException {
+        while (hasMoreSpans || !matchList.isEmpty() || !candidateList.isEmpty()) {
+            if (!matchList.isEmpty()) {
+                // A pending match carries its own document number and
+                // positions, so they are read from the match itself.
+                CandidateSpan match = matchList.remove(0);
+                matchDocNumber = match.getDoc();
+                matchStartPosition = match.getStart();
+                matchEndPosition = match.getEnd();
+                if (collectPayloads) {
+                    matchPayload.addAll(match.getPayloads());
+                }
+                log.trace("Match doc#: {}", matchDocNumber);
+                log.trace("Match positions: {}-{}", matchStartPosition,
+                        matchEndPosition);
+                return true;
+            }
+            // Forward firstspan
+            hasMoreFirstSpan = firstSpans.next();
+            if (hasMoreFirstSpan) {
+                setMatchList();
+            }
+            else {
+                hasMoreSpans = false;
+                candidateList.clear();
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Collects the matches for the firstspan the enumeration has just been
+     * moved to.
+     */
+    private void setMatchList() throws IOException {
+        if (firstSpans.doc() == candidateListDocNum) {
+            // Same document: the retained candidates may match again.
+            searchCandidates();
+            searchMatches();
+        }
+        else {
+            // Other document: the retained candidates cannot match anymore.
+            candidateList.clear();
+            if (hasMoreSpans && ensureSameDoc(firstSpans, secondSpans)) {
+                candidateListDocNum = firstSpans.doc();
+                searchMatches();
+            }
+        }
+    }
+
+    /**
+     * Matches the current firstspan against the retained candidates and
+     * drops every candidate that does not start at the end of the firstspan.
+     */
+    private void searchCandidates() throws IOException {
+        Iterator<CandidateSpan> i = candidateList.iterator();
+        while (i.hasNext()) {
+            CandidateSpan cs = i.next();
+            if (cs.getStart() == firstSpans.end()) {
+                addMatch(cs);
+            }
+            else {
+                i.remove();
+            }
+        }
+    }
+
+    /**
+     * Forwards the secondspans within the current document until one starts
+     * behind the end of the current firstspan. Secondspans starting exactly
+     * at that end become matches and are retained as candidates.
+     * NOTE(review): secondspans starting before that end are passed over
+     * without being stored - confirm that no later firstspan can end there.
+     */
+    private void searchMatches() throws IOException {
+        while (hasMoreSpans && candidateListDocNum == secondSpans.doc()) {
+            if (secondSpans.start() > firstSpans.end()) {
+                break;
+            }
+            if (secondSpans.start() == firstSpans.end()) {
+                // One snapshot serves both lists; addMatch() only reads it.
+                CandidateSpan candidate = new CandidateSpan(secondSpans);
+                candidateList.add(candidate);
+                addMatch(candidate);
+            }
+            hasMoreSpans = secondSpans.next();
+        }
+    }
+
+    /**
+     * Queues a match reaching from the start of the current firstspan to the
+     * end of the given secondspan.
+     */
+    private void addMatch(CandidateSpan cs) throws IOException {
+        int start = firstSpans.start();
+        long cost = firstSpans.cost() + cs.getCost();
+
+        List<byte[]> payloads = new ArrayList<byte[]>();
+        if (collectPayloads) {
+            if (firstSpans.isPayloadAvailable()) {
+                payloads.addAll(firstSpans.getPayload());
+            }
+            if (cs.getPayloads() != null) {
+                payloads.addAll(cs.getPayloads());
+            }
+        }
+
+        matchList.add(new CandidateSpan(start, cs.getEnd(),
+                candidateListDocNum, cost, payloads));
+    }
+
+    /**
+     * Moves the enumeration to the next match in a document whose number is
+     * greater than or equal to target.
+     */
+    @Override
+    public boolean skipTo(int target) throws IOException {
+        if (hasMoreSpans && (firstSpans.doc() < target)) {
+            // Everything collected so far belongs to the document that is
+            // left behind.
+            matchList.clear();
+            candidateList.clear();
+            if (!firstSpans.skipTo(target)) {
+                hasMoreSpans = false;
+                return false;
+            }
+            // skipTo() has put the firstspans on a span already. It is
+            // examined here because advance() forwards the firstspans before
+            // it looks for matches and would pass over that span.
+            setMatchList();
+        }
+        matchPayload.clear();
+        return advance();
+    }
+
+    @Override
+    public long cost() {
+        return firstSpans.cost() + secondSpans.cost();
+    }
};
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/OrderedDistanceSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/OrderedDistanceSpans.java
index 732eb42..1fb9474 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/OrderedDistanceSpans.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/OrderedDistanceSpans.java
@@ -99,8 +99,8 @@
this.matchDocNumber = secondSpans.doc();
if (collectPayloads){
- if (candidateSpan.getPayloads() != null) {
- matchPayload.addAll(candidateSpan.getPayloads());
+ if (candidateSpan.getPayloads() != null) {
+ matchPayload.addAll(candidateSpan.getPayloads());
}
if (secondSpans.isPayloadAvailable()) {
matchPayload.addAll(secondSpans.getPayload());
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/QuantifierSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/QuantifierSpans.java
new file mode 100644
index 0000000..8c99ac0
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/QuantifierSpans.java
@@ -0,0 +1,164 @@
+package de.ids_mannheim.korap.query.spans;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.lucene.index.AtomicReaderContext;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermContext;
+import org.apache.lucene.util.Bits;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import de.ids_mannheim.korap.query.SpanQuantifierQuery;
+
+/**
+ * Enumeration of matches made of a run of spans of a single clause. A run
+ * continues as long as the next span starts no later than the end of the
+ * previous one; runs of min to max spans are returned as matches.
+ */
+public class QuantifierSpans extends SimpleSpans{
+
+    private int min,max;
+    private long matchCost;
+    // Matches collected for the current start span that were not returned yet.
+    private List<CandidateSpan> matchList;
+    private static final Logger log = LoggerFactory.getLogger(QuantifierSpans.class);
+
+    /** Reads the bounds from the query and moves to the first span. */
+    public QuantifierSpans(SpanQuantifierQuery query,
+            AtomicReaderContext context, Bits acceptDocs,
+            Map<Term, TermContext> termContexts)
+            throws IOException {
+        super(query, context, acceptDocs, termContexts);
+        this.min = query.getMin();
+        this.max = query.getMax();
+        matchList = new ArrayList<CandidateSpan>();
+        hasMoreSpans = firstSpans.next();
+    }
+
+    /** Moves the enumeration to the next match. */
+    @Override
+    public boolean next() throws IOException {
+        isStartEnumeration = false;
+        return advance();
+    }
+
+    /**
+     * Sets the next pending match, collecting the matches of the next start
+     * span whenever no match is pending.
+     *
+     * @return true if a match was set, false if there is none left
+     */
+    private boolean advance() throws IOException {
+        while (hasMoreSpans || !matchList.isEmpty()) {
+            if (!matchList.isEmpty()) {
+                setMatchProperties(matchList.remove(0));
+                return true;
+            }
+            matchCost = 0;
+            setMatchList();
+        }
+        return false;
+    }
+
+    /**
+     * Collects all matches starting at the span the enumeration is
+     * currently positioned on.
+     */
+    private void setMatchList() throws IOException {
+        CandidateSpan startSpan = new CandidateSpan(firstSpans);
+        if (min == 1) {
+            matchList.add(startSpan);
+        }
+
+        // With max < 2 there is no run to extend. The span still has to be
+        // left, otherwise advance() would examine it again forever.
+        if (max < 2) {
+            hasMoreSpans = firstSpans.next();
+            return;
+        }
+
+        CandidateSpan prevSpan = startSpan;
+        int n = 2;
+        while (n <= max
+                && (hasMoreSpans = firstSpans.next())
+                && startSpan.getDoc() == firstSpans.doc()) {
+            if (firstSpans.start() > prevSpan.getEnd()) {
+                break;
+            }
+            if (min <= n) {
+                Collection<byte[]> payload = null;
+                if (firstSpans.isPayloadAvailable()) {
+                    payload = firstSpans.getPayload();
+                }
+                matchCost += firstSpans.cost();
+                // NOTE(review): only the payload of the last span of the run
+                // is stored, not those of the spans before it - confirm.
+                matchList.add(new CandidateSpan(startSpan.getStart(),
+                        firstSpans.end(), firstSpans.doc(), matchCost,
+                        payload));
+            }
+            prevSpan = new CandidateSpan(firstSpans);
+            n++;
+        }
+    }
+
+    /**
+     * Copies doc number, positions and payloads of the given match into the
+     * match fields of this enumeration.
+     */
+    private void setMatchProperties(CandidateSpan candidateSpan)
+            throws IOException {
+        matchDocNumber = candidateSpan.getDoc();
+        matchStartPosition = candidateSpan.getStart();
+        matchEndPosition = candidateSpan.getEnd();
+        // Several matches may be taken from one matchList, so the payloads
+        // of the previous match are discarded for each of them.
+        matchPayload.clear();
+        if (collectPayloads && candidateSpan.getPayloads() != null) {
+            matchPayload.addAll(candidateSpan.getPayloads());
+        }
+        log.trace("doc# {}, start {}, end {}",matchDocNumber,matchStartPosition,
+                matchEndPosition);
+    }
+
+    /**
+     * Moves the enumeration to the next match in a document whose number is
+     * greater than or equal to target.
+     */
+    @Override
+    public boolean skipTo(int target) throws IOException {
+        if (hasMoreSpans && firstSpans.doc() < target) {
+            // Pending matches lie in documents before target as well.
+            matchList.clear();
+            if (!firstSpans.skipTo(target)) {
+                hasMoreSpans = false;
+                return false;
+            }
+        }
+        else {
+            // Nothing to skip: keep the pending matches unless they belong
+            // to a document before target.
+            while (!matchList.isEmpty()
+                    && matchList.get(0).getDoc() < target) {
+                matchList.remove(0);
+            }
+        }
+        return advance();
+    }
+
+    /**
+     * NOTE(review): returns the cost summed up for the most recently
+     * collected run, which is 0 before the first one - confirm that callers
+     * expect this rather than the cost of the underlying spans.
+     */
+    @Override
+    public long cost() {
+        return matchCost;
+    }
+
+}
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/SimpleSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/SimpleSpans.java
index 17bb6ba..350d00e 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/SimpleSpans.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/SimpleSpans.java
@@ -48,8 +48,9 @@
// Get the enumeration of the two spans to match
firstSpans = simpleSpanQuery.getFirstClause().
getSpans(context, acceptDocs, termContexts);
- secondSpans = simpleSpanQuery.getSecondClause().
- getSpans(context, acceptDocs, termContexts);
+ if (simpleSpanQuery.getSecondClause() != null) // secondSpans is not assigned when the query has no second clause
+ secondSpans = simpleSpanQuery.getSecondClause().
+ getSpans(context, acceptDocs, termContexts);
isStartEnumeration=true;
}
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/UnorderedDistanceSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/UnorderedDistanceSpans.java
index 3d9fd0a..843fe9d 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/UnorderedDistanceSpans.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/UnorderedDistanceSpans.java
@@ -95,7 +95,7 @@
if (currentFirstSpan.getEnd() < currentSecondSpan.getEnd() ||
isLastCandidateSmaller(currentFirstSpan, currentSecondSpan)){
-// System.out.println("current target: "+firstSpanList.get(0).getStart() +" "+firstSpanList.get(0).getEnd());
+ log.trace("current target: {} {}", firstSpanList.get(0).getStart(), firstSpanList.get(0).getEnd());
// System.out.println("candidates:");
// for (CandidateSpan cs: secondSpanList) {
// System.out.println(cs.getStart() +" "+ cs.getEnd());
@@ -107,7 +107,7 @@
updateList(firstSpanList);
}
else {
-// System.out.println("current target: "+secondSpanList.get(0).getStart() +" "+secondSpanList.get(0).getEnd());
+ log.trace("current target: {} {}", secondSpanList.get(0).getStart(), secondSpanList.get(0).getEnd());
// System.out.println("candidates:");
// for (CandidateSpan cs: firstSpanList) {
// System.out.println(cs.getStart() +" "+ cs.getEnd());
@@ -116,17 +116,17 @@
matchList = findMatches(currentSecondSpan, firstSpanList);
setMatchSecondSpan(currentSecondSpan);
matchListSpanNum = 1;
- updateList(secondSpanList);
+ updateList(secondSpanList);
}
}
else if (firstSpanList.isEmpty()){
-// System.out.println("current target: "+secondSpanList.get(0).getStart() +" "+secondSpanList.get(0).getEnd());
-// System.out.println("candidates: empty");
+ log.trace("current target: {} {}", secondSpanList.get(0).getStart(), secondSpanList.get(0).getEnd());
+ log.trace("candidates: empty");
updateList(secondSpanList);
}
else{
-// System.out.println("current target: "+firstSpanList.get(0).getStart() +" "+firstSpanList.get(0).getEnd());
-// System.out.println("candidates: empty");
+ log.trace("current target: {} {}", firstSpanList.get(0).getStart(), firstSpanList.get(0).getEnd());
+ log.trace("candidates: empty");
updateList(firstSpanList);
}
}
@@ -201,10 +201,9 @@
setMatchFirstSpan(cs.getChildSpan());
else setMatchSecondSpan(cs.getChildSpan());
- log.trace("Match doc#={} start={} end={}", matchDocNumber,
- matchStartPosition,matchEndPosition);
- //System.out.println("firstspan "+getMatchFirstSpan().getStart()+" "+ getMatchFirstSpan().getEnd());
- //System.out.println("secondspan "+getMatchSecondSpan().getStart()+" "+ getMatchSecondSpan().getEnd());
+ log.trace("Match doc#={} start={} end={}",matchDocNumber,matchStartPosition,matchEndPosition);
+ log.trace("firstspan {} {}", getMatchFirstSpan().getStart(), getMatchFirstSpan().getEnd());
+ log.trace("secondspan {} {}", getMatchSecondSpan().getStart(), getMatchSecondSpan().getEnd());
}
@Override
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/UnorderedTokenDistanceSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/UnorderedTokenDistanceSpans.java
index f733bbb..5f1abd9 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/UnorderedTokenDistanceSpans.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/UnorderedTokenDistanceSpans.java
@@ -35,9 +35,9 @@
ensureSameDoc(firstSpans, secondSpans)){
firstSpanList.add(new CandidateSpan(firstSpans));
secondSpanList.add(new CandidateSpan(secondSpans));
+ currentDocNum = firstSpans.doc(); // read before the next() call below moves firstSpans on
hasMoreFirstSpans = firstSpans.next();
- hasMoreSecondSpans = secondSpans.next();
- currentDocNum = firstSpans.doc();
+ hasMoreSecondSpans = secondSpans.next();
}
else {
hasMoreSpans = false;