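Update DistanceConstraint: drop the unit parameter; the unit is now "w" for token distance or the element name for element distance
Add childSpan to CandidateSpan
Fix bugs in unordered distance queries (fillEmptyCandidateLists replaced by prepareLists, which also refills a single empty candidate list)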
diff --git a/src/main/java/de/ids_mannheim/korap/query/DistanceConstraint.java b/src/main/java/de/ids_mannheim/korap/query/DistanceConstraint.java
index 3617066..b061046 100644
--- a/src/main/java/de/ids_mannheim/korap/query/DistanceConstraint.java
+++ b/src/main/java/de/ids_mannheim/korap/query/DistanceConstraint.java
@@ -6,16 +6,19 @@
private SpanElementQuery elementQuery;
private boolean exclusion;
- public DistanceConstraint(String unit, int min, int max, boolean exclusion) {
- this.unit = unit;
+ public DistanceConstraint(int min, int max, boolean exclusion) {
+ this.unit = "w";
this.minDistance = min;
this.maxDistance = max;
this.exclusion = exclusion;
}
- public DistanceConstraint(SpanElementQuery elementQuery, String unit,
- int min, int max, boolean exclusion) {
- this(unit, min, max, exclusion);
+ public DistanceConstraint(SpanElementQuery elementQuery, int min, int max,
+ boolean exclusion) {
+ this.unit = elementQuery.getElementStr();
+ this.minDistance = min;
+ this.maxDistance = max;
+ this.exclusion = exclusion;
this.elementQuery = elementQuery;
}
diff --git a/src/main/java/de/ids_mannheim/korap/query/SimpleSpanQuery.java b/src/main/java/de/ids_mannheim/korap/query/SimpleSpanQuery.java
index ba398be..b053fd4 100644
--- a/src/main/java/de/ids_mannheim/korap/query/SimpleSpanQuery.java
+++ b/src/main/java/de/ids_mannheim/korap/query/SimpleSpanQuery.java
@@ -7,7 +7,6 @@
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.spans.SpanQuery;
-import org.apache.lucene.util.ToStringUtils;
/** An abstract class for a Spanquery having two clauses.
*
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/CandidateSpan.java b/src/main/java/de/ids_mannheim/korap/query/spans/CandidateSpan.java
index d8727f2..8f4273a 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/CandidateSpan.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/CandidateSpan.java
@@ -13,6 +13,7 @@
private long cost;
private Collection<byte[]> payloads;
private int position;
+ private CandidateSpan childSpan; // used for multiple distance with unordered constraint
public CandidateSpan(Spans span) throws IOException {
this.doc = span.doc();
@@ -84,6 +85,14 @@
public void setPosition(int position) {
this.position = position;
}
+
+ public CandidateSpan getChildSpan() {
+ return childSpan;
+ }
+
+ public void setChildSpan(CandidateSpan childSpan) {
+ this.childSpan = childSpan;
+ }
}
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/ElementDistanceSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/ElementDistanceSpans.java
index cd751ab..b079c72 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/ElementDistanceSpans.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/ElementDistanceSpans.java
@@ -40,7 +40,7 @@
hasMoreElements = elements.next();
hasMoreSpans = hasMoreFirstSpans && hasMoreElements;
- elementPosition=0;
+ elementPosition=0;
}
@Override
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/UnorderedDistanceSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/UnorderedDistanceSpans.java
index b4b71d8..3d9fd0a 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/UnorderedDistanceSpans.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/UnorderedDistanceSpans.java
@@ -28,6 +28,7 @@
protected List<CandidateSpan> matchList;
private long matchCost;
private int matchListSpanNum;
+ protected int currentDocNum;
public UnorderedDistanceSpans(SpanDistanceQuery query,
AtomicReaderContext context, Bits acceptDocs,
@@ -35,7 +36,6 @@
super(query, context, acceptDocs, termContexts);
minDistance = query.getMinDistance();
maxDistance = query.getMaxDistance();
- collectPayloads = query.isCollectPayloads();
firstSpanList = new ArrayList<CandidateSpan>();
secondSpanList = new ArrayList<CandidateSpan>();
@@ -52,15 +52,8 @@
if (!matchList.isEmpty()){
setMatchProperties();
return true;
- }
-
- if (firstSpanList.isEmpty() && secondSpanList.isEmpty()){
- if (fillEmptyCandidateLists()){
- setMatchList();
- }
- else { hasMoreSpans = false; }
- }
- else { setMatchList(); }
+ }
+ if (prepareLists()) setMatchList();
}
return false;
}
@@ -72,7 +65,7 @@
*
* @return true iff at least one of the candidate lists can be filled.
* */
- protected abstract boolean fillEmptyCandidateLists() throws IOException;
+ protected abstract boolean prepareLists() throws IOException;
/** Set the list of matches between the span having the smallest position, and
* its candidates. Simply remove the span if it does not have any candidates.
@@ -83,6 +76,16 @@
hasMoreFirstSpans,secondSpanList);
hasMoreSecondSpans = setCandidateList(secondSpanList,secondSpans,
hasMoreSecondSpans,firstSpanList);
+// System.out.println("--------------------");
+// System.out.println("firstSpanList:");
+// for (CandidateSpan cs: firstSpanList) {
+// System.out.println(cs.getStart() +" "+ cs.getEnd());
+// }
+//
+// System.out.println("secondSpanList:");
+// for (CandidateSpan cs: secondSpanList) {
+// System.out.println(cs.getStart() +" "+ cs.getEnd());
+// }
CandidateSpan currentFirstSpan, currentSecondSpan;
if (!firstSpanList.isEmpty() && !secondSpanList.isEmpty()){
@@ -92,12 +95,24 @@
if (currentFirstSpan.getEnd() < currentSecondSpan.getEnd() ||
isLastCandidateSmaller(currentFirstSpan, currentSecondSpan)){
+// System.out.println("current target: "+firstSpanList.get(0).getStart() +" "+firstSpanList.get(0).getEnd());
+// System.out.println("candidates:");
+// for (CandidateSpan cs: secondSpanList) {
+// System.out.println(cs.getStart() +" "+ cs.getEnd());
+// }
+
matchList = findMatches(currentFirstSpan, secondSpanList);
setMatchFirstSpan(currentFirstSpan);
matchListSpanNum = 2;
updateList(firstSpanList);
}
else {
+// System.out.println("current target: "+secondSpanList.get(0).getStart() +" "+secondSpanList.get(0).getEnd());
+// System.out.println("candidates:");
+// for (CandidateSpan cs: firstSpanList) {
+// System.out.println(cs.getStart() +" "+ cs.getEnd());
+// }
+
matchList = findMatches(currentSecondSpan, firstSpanList);
setMatchSecondSpan(currentSecondSpan);
matchListSpanNum = 1;
@@ -105,9 +120,13 @@
}
}
else if (firstSpanList.isEmpty()){
+// System.out.println("current target: "+secondSpanList.get(0).getStart() +" "+secondSpanList.get(0).getEnd());
+// System.out.println("candidates: empty");
updateList(secondSpanList);
}
else{
+// System.out.println("current target: "+firstSpanList.get(0).getStart() +" "+firstSpanList.get(0).getEnd());
+// System.out.println("candidates: empty");
updateList(firstSpanList);
}
}
@@ -161,7 +180,10 @@
payloads.addAll(cs.getPayloads());
}
}
- return new CandidateSpan(start,end,doc,cost,payloads);
+
+ CandidateSpan match = new CandidateSpan(start,end,doc,cost,payloads);
+ match.setChildSpan(cs);
+ return match;
}
/** Assign the first candidate span in the match list as the current span match.
@@ -174,13 +196,15 @@
matchCost = cs.getCost();
matchPayload.addAll(cs.getPayloads());
matchList.remove(0);
-
+
if (matchListSpanNum == 1)
- setMatchFirstSpan(cs);
- else setMatchSecondSpan(cs);
+ setMatchFirstSpan(cs.getChildSpan());
+ else setMatchSecondSpan(cs.getChildSpan());
log.trace("Match doc#={} start={} end={}", matchDocNumber,
matchStartPosition,matchEndPosition);
+ //System.out.println("firstspan "+getMatchFirstSpan().getStart()+" "+ getMatchFirstSpan().getEnd());
+ //System.out.println("secondspan "+getMatchSecondSpan().getStart()+" "+ getMatchSecondSpan().getEnd());
}
@Override
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/UnorderedElementDistanceSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/UnorderedElementDistanceSpans.java
index 21c22e3..b9ed42c 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/UnorderedElementDistanceSpans.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/UnorderedElementDistanceSpans.java
@@ -29,8 +29,7 @@
// contains all previous elements whose position is greater than the last
// target span
- private List<CandidateSpan> elementList;
- private int currentDoc;
+ private List<CandidateSpan> elementList;
public UnorderedElementDistanceSpans(SpanDistanceQuery query,
AtomicReaderContext context, Bits acceptDocs,
@@ -44,34 +43,53 @@
}
@Override
- protected boolean fillEmptyCandidateLists() throws IOException {
- int position;
- while (firstSpanList.isEmpty() && secondSpanList.isEmpty()){
-
+ protected boolean prepareLists() throws IOException {
+
+ if (firstSpanList.isEmpty() && secondSpanList.isEmpty()){
if (hasMoreFirstSpans && hasMoreSecondSpans && hasMoreElements &&
findSameDoc(firstSpans, secondSpans, elements)){
- if (currentDoc != firstSpans.doc()){
- currentDoc = firstSpans.doc();
+ if (currentDocNum != firstSpans.doc()){
+ currentDocNum = firstSpans.doc();
elementList.clear();
- }
-
- position = findElementPosition(firstSpans);
- if (position != -1)
- firstSpanList.add(new CandidateSpan(firstSpans,position));
-
- position = findElementPosition(secondSpans);
- if (position != -1)
- secondSpanList.add(new CandidateSpan(secondSpans,position));
-
- hasMoreFirstSpans = firstSpans.next();
- hasMoreSecondSpans = secondSpans.next();
+ }
+
+ hasMoreFirstSpans = addSpan(firstSpans,firstSpanList,hasMoreFirstSpans);
+ hasMoreSecondSpans = addSpan(secondSpans, secondSpanList, hasMoreSecondSpans);
}
- else { return false; }
+ else {
+ hasMoreSpans = false;
+ return false;
+ }
+ }
+ else if (firstSpanList.isEmpty() && hasMoreFirstSpans &&
+ firstSpans.doc() == currentDocNum){
+ hasMoreFirstSpans = addSpan(firstSpans,firstSpanList,hasMoreFirstSpans);
}
+ else if (secondSpanList.isEmpty() && hasMoreSecondSpans &&
+ secondSpans.doc() == currentDocNum){
+ hasMoreSecondSpans = addSpan(secondSpans, secondSpanList, hasMoreSecondSpans);
+ }
+
return true;
}
+ private boolean addSpan(Spans span, List<CandidateSpan> list, boolean hasMoreSpan)
+ throws IOException {
+ int position;
+ while (hasMoreSpan && span.doc() == currentDocNum){
+ position = findElementPosition(span);
+ if (position != -1){
+ list.add(new CandidateSpan(span,position));
+ hasMoreSpan = span.next();
+ return hasMoreSpan;
+ }
+ hasMoreSpan = span.next();
+ }
+ return hasMoreSpan;
+ }
+
+
/** Find the element position of the span in the element list or by advancing
* the element spans until encountering the span.
*
@@ -99,7 +117,7 @@
* */
private boolean advanceElementTo(Spans span) throws IOException {
while (hasMoreElements &&
- elements.doc() == currentDoc &&
+ elements.doc() == currentDocNum &&
elements.start() < span.end()){
if (span.start() >= elements.start() &&
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/UnorderedTokenDistanceSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/UnorderedTokenDistanceSpans.java
index f7e6037..f733bbb 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/UnorderedTokenDistanceSpans.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/UnorderedTokenDistanceSpans.java
@@ -28,16 +28,33 @@
}
@Override
- protected boolean fillEmptyCandidateLists() throws IOException {
- if (hasMoreFirstSpans && hasMoreSecondSpans &&
- ensureSameDoc(firstSpans, secondSpans)){
- firstSpanList.add(new CandidateSpan(firstSpans));
- secondSpanList.add(new CandidateSpan(secondSpans));
- hasMoreFirstSpans = firstSpans.next();
- hasMoreSecondSpans = secondSpans.next();
- return true;
+ protected boolean prepareLists() throws IOException {
+
+ if (firstSpanList.isEmpty() && secondSpanList.isEmpty()){
+ if (hasMoreFirstSpans && hasMoreSecondSpans &&
+ ensureSameDoc(firstSpans, secondSpans)){
+ firstSpanList.add(new CandidateSpan(firstSpans));
+ secondSpanList.add(new CandidateSpan(secondSpans));
+ currentDocNum = firstSpans.doc(); // record the candidates' doc before next() advances the spans
+ hasMoreFirstSpans = firstSpans.next();
+ hasMoreSecondSpans = secondSpans.next();
+ }
+ else {
+ hasMoreSpans = false;
+ return false;
+ }
}
- return false;
+ else if (firstSpanList.isEmpty() && hasMoreFirstSpans &&
+ firstSpans.doc() == currentDocNum){
+ firstSpanList.add(new CandidateSpan(firstSpans));
+ hasMoreFirstSpans = firstSpans.next();
+ }
+ else if (secondSpanList.isEmpty() && hasMoreSecondSpans &&
+ secondSpans.doc() == currentDocNum){
+ secondSpanList.add(new CandidateSpan(secondSpans));
+ hasMoreSecondSpans = secondSpans.next();
+ }
+ return true;
}
@Override
diff --git a/src/main/java/de/ids_mannheim/korap/query/wrap/SpanSequenceQueryWrapper.java b/src/main/java/de/ids_mannheim/korap/query/wrap/SpanSequenceQueryWrapper.java
index 93fcee4..bb2208d 100644
--- a/src/main/java/de/ids_mannheim/korap/query/wrap/SpanSequenceQueryWrapper.java
+++ b/src/main/java/de/ids_mannheim/korap/query/wrap/SpanSequenceQueryWrapper.java
@@ -92,7 +92,7 @@
public SpanSequenceQueryWrapper withConstraint (int min, int max) {
if (this.constraints == null)
this.constraints = new ArrayList<DistanceConstraint>(1);
- this.constraints.add(new DistanceConstraint("w", min, max, false));
+ this.constraints.add(new DistanceConstraint(min, max, false));
return this;
};
@@ -100,11 +100,11 @@
if (this.constraints == null)
this.constraints = new ArrayList<DistanceConstraint>(1);
if (unit.equals("w"))
- this.constraints.add(new DistanceConstraint("w", min, max, false));
+ this.constraints.add(new DistanceConstraint(min, max, false));
else
this.constraints.add(
new DistanceConstraint(
- new SpanElementQuery(this.field, unit), unit, min, max, false
+ new SpanElementQuery(this.field, unit), min, max, false
)
);
return this;
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestUnorderedDistanceIndex.java b/src/test/java/de/ids_mannheim/korap/index/TestUnorderedDistanceIndex.java
index 72c735c..dac5db2 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestUnorderedDistanceIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestUnorderedDistanceIndex.java
@@ -16,7 +16,6 @@
import de.ids_mannheim.korap.query.SpanDistanceQuery;
import de.ids_mannheim.korap.query.SpanElementQuery;
import de.ids_mannheim.korap.query.SpanNextQuery;
-import de.ids_mannheim.korap.query.SpanSegmentQuery;
@RunWith(JUnit4.class)
public class TestUnorderedDistanceIndex{
@@ -252,4 +251,67 @@
assertEquals(4, kr.totalResults());
}
+
+ /** Nested distance queries
+ * */
+ @Test
+ public void testCase7() throws IOException{
+ //System.out.println("testcase 7");
+ ki = new KorapIndex();
+ ki.addDoc(createFieldDoc0());
+ ki.addDoc(createFieldDoc1());
+ ki.commit();
+
+ SpanQuery sq = createQuery("s:c","s:d",1,2,false);
+ SpanQuery sq2 = new SpanDistanceQuery(
+ sq,
+ new SpanTermQuery(new Term("base","s:e")),
+ 1,
+ 2,
+ false,
+ true);
+ kr = ki.search(sq2, (short) 10);
+ //assertEquals(3, kr.totalResults());
+ }
+
+ @Test
+ public void testCase8() throws IOException{
+ //System.out.println("testcase 8");
+ ki = new KorapIndex();
+ for (String i : new String[] {"AUG-53507", "SEP-62389", "NOV-74813"}) {
+ ki.addDocFile(
+ getClass().getResource("/a00/" + i + ".json.gz").getFile(), true
+ );
+ };
+ ki.commit();
+
+ SpanQuery sq = new SpanDistanceQuery(
+ new SpanTermQuery(new Term("tokens","s:in")),
+ new SpanTermQuery(new Term("tokens","s:horrendem")),
+ 0,
+ 2,
+ false,
+ true
+ );
+ kr = ki.search(sq, (short) 10);
+
+ assertEquals(3, kr.totalResults());
+ assertEquals(170, kr.getMatch(0).startPos);
+ assertEquals(172, kr.getMatch(0).endPos);
+ assertEquals(174, kr.getMatch(1).startPos);
+ assertEquals(176, kr.getMatch(1).endPos);
+ assertEquals(71, kr.getMatch(2).startPos);
+ assertEquals(73, kr.getMatch(2).endPos);
+
+// System.out.print(kr.getTotalResults()+"\n");
+// for (int i=0; i< kr.getTotalResults(); i++){
+// System.out.println(
+// kr.match(i).getLocalDocID()+" "+
+// kr.match(i).startPos + " " +
+// kr.match(i).endPos
+// );
+// }
+
+ }
+
}
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestUnorderedElementDistanceIndex.java b/src/test/java/de/ids_mannheim/korap/index/TestUnorderedElementDistanceIndex.java
index 75244eb..703e96e 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestUnorderedElementDistanceIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestUnorderedElementDistanceIndex.java
@@ -59,10 +59,10 @@
fd.addTV("base",
"text",
"[(0-1)s:e|_1#0-1|<>:p#0-2$<i>1]" +
- "[(1-2)s:c|_2#1-2|<>:p#1-2$<i>2]" +
- "[(2-3)s:d|_3#2-3|<>:p#2-3$<i>3]" +
- "[(3-4)s:b|_4#3-4|<>:p#3-4$<i>4]" +
- "[(4-5)s:d|_5#4-5|<>:p#4-5$<i>5]" +
+ "[(1-2)s:e|_2#1-2|<>:p#1-2$<i>2]" +
+ "[(2-3)s:c|_3#2-3|<>:p#2-3$<i>3]" +
+ "[(3-4)s:e|_4#3-4|<>:p#3-4$<i>4]" +
+ "[(4-5)s:b|_5#4-5|<>:p#4-5$<i>5]" +
"[(5-6)s:c|_6#5-6|<>:p#5-6$<i>6]" +
"[(6-7)s:e|_7#6-7|<>:p#6-7$<i>7]" +
"[(7-8)s:b|_8#7-8|<>:p#7-8$<i>8]");
@@ -133,7 +133,7 @@
/** Ensure same doc.
* In the beginning, first and second spans are already too far from each other
- * (one-list-empty case, both-list-empty case).
+ * (one-list-empty case, both-list-empty-case).
* */
@Test
public void testCase2() throws IOException{
@@ -147,10 +147,15 @@
SpanQuery sq;
sq = createQuery("p", "s:b", "s:e", 0, 2,false);
kr = ki.search(sq, (short) 10);
-
- assertEquals(1,kr.getTotalResults());
- assertEquals(6, kr.getMatch(0).startPos);
- assertEquals(8, kr.getMatch(0).endPos);
+
+ assertEquals(3,kr.getTotalResults());
+ assertEquals(2,kr.getMatch(0).getLocalDocID());
+ assertEquals(3, kr.getMatch(0).startPos);
+ assertEquals(5, kr.getMatch(0).endPos);
+ assertEquals(4, kr.getMatch(1).startPos);
+ assertEquals(7, kr.getMatch(1).endPos);
+ assertEquals(6, kr.getMatch(2).startPos);
+ assertEquals(8, kr.getMatch(2).endPos);
}
/** Multiple occurrences in an element.