Add quantifier query,
update NextSpans to allow multiple matches at the same first-span position,
and add test queries against the WPD corpus
diff --git a/pom.xml b/pom.xml
index 8203bdf..31f2f8f 100644
--- a/pom.xml
+++ b/pom.xml
@@ -150,6 +150,14 @@
<!--
<finalName>KorapTools</finalName>
-->
+
+ <archive>
+ <manifest>
+ <addClasspath>true</addClasspath>
+ <mainClass>de.ids_mannheim.korap.KorapIndexer</mainClass>
+ </manifest>
+ </archive>
+
<appendAssemblyId>false</appendAssemblyId>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
@@ -183,6 +191,7 @@
<excludes>
<exclude>**/TestRealIndex.java</exclude>
<exclude>**/benchmark/*</exclude>
+ <exclude>**/TestWPDIndex.java</exclude> <!-- needs a local WPD index (lucene.indexDir in korap.conf) -->
</excludes>
</configuration>
</plugin>
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/CandidateSpan.java b/src/main/java/de/ids_mannheim/korap/query/spans/CandidateSpan.java
index 8f4273a..41ba103 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/CandidateSpan.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/CandidateSpan.java
@@ -1,6 +1,7 @@
package de.ids_mannheim.korap.query.spans;
import java.io.IOException;
+import java.util.ArrayList;
import java.util.Collection;
import org.apache.lucene.search.spans.Spans;
@@ -11,7 +12,7 @@
public class CandidateSpan {
private int doc,start,end;
private long cost;
- private Collection<byte[]> payloads;
+ private Collection<byte[]> payloads = new ArrayList<>();
private int position;
private CandidateSpan childSpan; // used for multiple distance with unordered constraint
@@ -19,14 +20,9 @@
this.doc = span.doc();
this.start = span.start();
this.end = span.end();
- this.cost = span.cost();
-
- if (span.isPayloadAvailable()){
- this.payloads = span.getPayload();
- }
- else{
- this.payloads = null;
- }
+ this.cost = span.cost();
+ if (span.isPayloadAvailable())
+ setPayloads(span.getPayload());
}
public CandidateSpan(Spans span, int position) throws IOException {
@@ -40,7 +36,7 @@
this.end = end;
this.doc = doc;
this.cost = cost;
- this.payloads = payloads;
+ if (payloads != null) setPayloads(payloads);
}
public int getDoc() {
@@ -67,7 +63,9 @@
}
public void setPayloads(Collection<byte[]> payloads) {
- this.payloads = payloads;
+ for (byte[] b : payloads){
+ this.payloads.add(b.clone());
+ }
}
public long getCost() {
@@ -93,6 +91,4 @@
public void setChildSpan(CandidateSpan childSpan) {
this.childSpan = childSpan;
}
-
-
}
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/NextSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/NextSpans.java
index 6d189a5..02dfdcf 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/NextSpans.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/NextSpans.java
@@ -1,47 +1,156 @@
package de.ids_mannheim.korap.query.spans;
import java.io.IOException;
-
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
import java.util.Map;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.util.Bits;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
-
-import de.ids_mannheim.korap.query.SimpleSpanQuery;
+import de.ids_mannheim.korap.query.SpanNextQuery;
/** NextSpans is an enumeration of Span matches, which ensures that
* a span is immediately followed by another span.
*
- * TODO: nextSpans needs collectPayloads to be explicitly set true. Why?
- * ndiewald: They don't have to be set explicitely - just make them use it always
+ * Update: allow multiple matches at the same firstspan position
+ *
* @author margaretha
* */
-public class NextSpans extends NonPartialOverlappingSpans {
+public class NextSpans extends SimpleSpans {
- public NextSpans (SimpleSpanQuery simpleSpanQuery,
+ private List<CandidateSpan> matchList;
+ private List<CandidateSpan> candidateList;
+ private int candidateListDocNum;
+ private boolean hasMoreFirstSpan;
+
+ private Logger log = LoggerFactory.getLogger(NextSpans.class);
+
+ /**
+  * Creates the enumeration for the given next-query and moves the second
+  * spans onto their first span.
+  *
+  * @param spanNextQuery the query supplying both clauses and the payload flag
+  * @param context the reader context passed on to the superclass
+  * @param acceptDocs the accepted-documents bits passed on to the superclass
+  * @param termContexts the term contexts passed on to the superclass
+  * @throws IOException if advancing the second spans fails
+  */
+ public NextSpans (SpanNextQuery spanNextQuery,
AtomicReaderContext context,
Bits acceptDocs,
Map<Term,TermContext> termContexts) throws IOException {
- super(simpleSpanQuery, context, acceptDocs, termContexts);
- };
+     super(spanNextQuery, context, acceptDocs, termContexts);
+     collectPayloads = spanNextQuery.isCollectPayloads();
+     matchList = new ArrayList<>();
+     candidateList = new ArrayList<>();
+     // Only the second spans are advanced here; the first spans are
+     // advanced lazily by advance().
+     hasMoreSpans = secondSpans.next();
+     // NOTE(review): firstSpans.doc() is read before firstSpans.next() has
+     // been called - confirm the value is meaningful at this point.
+     candidateListDocNum = firstSpans.doc();
+ }
- /** Check weather the end position of the current firstspan equals
- * the start position of the secondspan.
- **/
- @Override
- protected int findMatch() {
- if (firstSpans.end() == secondSpans.start()) {
- matchDocNumber = firstSpans.doc();
- matchStartPosition = firstSpans.start();
- matchEndPosition = secondSpans.end();
- return 0;
- }
- else if (firstSpans.end() > secondSpans.start())
- return 1;
+ /**
+  * Moves to the next match, discarding the payloads of the previous one.
+  *
+  * @return {@code true} if a further match was found
+  * @throws IOException if reading the underlying spans fails
+  */
+ @Override
+ public boolean next() throws IOException {
+     matchPayload.clear();
+     isStartEnumeration = false;
+     return advance();
+ }
- return -1;
- };
+ /**
+  * Reports the next queued match, refilling the queue from the following
+  * first span whenever it runs empty.
+  *
+  * @return {@code true} if a match was set, {@code false} when exhausted
+  * @throws IOException if reading the underlying spans fails
+  */
+ private boolean advance() throws IOException {
+     while (hasMoreSpans || !matchList.isEmpty() || !candidateList.isEmpty()) {
+         if (!matchList.isEmpty()) {
+             // Hand out the oldest queued match for the current first span.
+             CandidateSpan match = matchList.remove(0);
+             matchDocNumber = firstSpans.doc();
+             matchStartPosition = firstSpans.start();
+             matchEndPosition = match.getEnd();
+             if (collectPayloads) {
+                 matchPayload.addAll(match.getPayloads());
+             }
+             log.trace("Match doc#: {}",matchDocNumber);
+             log.trace("Match positions: {}-{}", matchStartPosition,
+                 matchEndPosition);
+             return true;
+         }
+
+         // Queue is empty: forward the first span and look for its matches.
+         hasMoreFirstSpan = firstSpans.next();
+         if (!hasMoreFirstSpan) {
+             hasMoreSpans = false;
+             candidateList.clear();
+             continue;
+         }
+         setMatchList();
+     }
+     return false;
+ }
+
+ /**
+  * Fills the match queue for the current first span. Within the document
+  * of the remembered candidates, both the candidates and the not yet
+  * consumed second spans are searched; otherwise the candidates are
+  * dropped and both spans are aligned on a common document first.
+  *
+  * @throws IOException if reading the underlying spans fails
+  */
+ private void setMatchList() throws IOException {
+     if (firstSpans.doc() == candidateListDocNum) {
+         searchCandidates();
+         searchMatches();
+         return;
+     }
+
+     // Different document: remembered candidates can no longer match.
+     candidateList.clear();
+     if (!hasMoreSpans || !ensureSameDoc(firstSpans,secondSpans)) {
+         return;
+     }
+     candidateListDocNum = firstSpans.doc();
+     searchMatches();
+ }
+
+ private void searchCandidates() throws IOException {
+ Iterator<CandidateSpan> i = candidateList.iterator();
+ CandidateSpan cs;
+ while(i.hasNext()){
+ cs = i.next();
+ if (cs.getStart() == firstSpans.end()){
+ addMatch(cs);
+ }
+ else{
+ //System.out.println(cs.getStart() + " " +firstSpans.end());
+ i.remove();
+ }
+ }
+ }
+
+
+ private void searchMatches() throws IOException {
+
+ while (hasMoreSpans && candidateListDocNum == secondSpans.doc()){
+ if (secondSpans.start() > firstSpans.end()){
+ break;
+ }
+ if (secondSpans.start() == firstSpans.end()){
+ candidateList.add(new CandidateSpan(secondSpans));
+ addMatch(new CandidateSpan(secondSpans));
+ }
+ hasMoreSpans = secondSpans.next();
+ }
+ }
+
+ /**
+  * Queues a match that starts at the current first span and ends at the
+  * given second-span candidate.
+  *
+  * @param cs the second-span candidate following the current first span
+  * @throws IOException if reading the first span's payload fails
+  */
+ private void addMatch(CandidateSpan cs) throws IOException{
+     // Payloads of the first span come first, then those of the candidate.
+     List<byte[]> combined = new ArrayList<byte[]>();
+     if (collectPayloads) {
+         if (firstSpans.isPayloadAvailable()) {
+             combined.addAll(firstSpans.getPayload());
+         }
+         if (cs.getPayloads() != null) {
+             combined.addAll(cs.getPayloads());
+         }
+     }
+
+     CandidateSpan match = new CandidateSpan(
+         firstSpans.start(),
+         cs.getEnd(),
+         candidateListDocNum,
+         firstSpans.cost() + cs.getCost(),
+         combined);
+     matchList.add(match);
+ }
+
+ /**
+  * Skips the first spans to the first document whose number is at least
+  * {@code target} and reports the next match from there.
+  *
+  * @param target the minimum document number to continue with
+  * @return {@code true} if a match was found
+  * @throws IOException if reading the underlying spans fails
+  */
+ @Override
+ public boolean skipTo(int target) throws IOException {
+     matchPayload.clear();
+     if (hasMoreSpans && (firstSpans.doc() < target)){
+         // Matches queued for the position before the skip are stale.
+         matchList.clear();
+         if (!firstSpans.skipTo(target)){
+             hasMoreSpans = false;
+             candidateList.clear();
+             return false;
+         }
+         // firstSpans now rests on a span that has not been examined yet.
+         // advance() would call firstSpans.next() and step past it, so its
+         // matches are collected here.
+         setMatchList();
+     }
+     return advance();
+ }
+
+ /**
+  * Returns the sum of the costs reported by the first and second spans.
+  */
+ @Override
+ public long cost() {
+     final long firstCost = firstSpans.cost();
+     final long secondCost = secondSpans.cost();
+     return firstCost + secondCost;
+ }
};
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/OrderedDistanceSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/OrderedDistanceSpans.java
index 732eb42..1fb9474 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/OrderedDistanceSpans.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/OrderedDistanceSpans.java
@@ -99,8 +99,8 @@
this.matchDocNumber = secondSpans.doc();
if (collectPayloads){
- if (candidateSpan.getPayloads() != null) {
- matchPayload.addAll(candidateSpan.getPayloads());
+ if (candidateSpan.getPayloads() != null) {
+ matchPayload.addAll(candidateSpan.getPayloads());
}
if (secondSpans.isPayloadAvailable()) {
matchPayload.addAll(secondSpans.getPayload());
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/QuantifierSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/QuantifierSpans.java
new file mode 100644
index 0000000..8c99ac0
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/QuantifierSpans.java
@@ -0,0 +1,126 @@
+package de.ids_mannheim.korap.query.spans;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.lucene.index.AtomicReaderContext;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermContext;
+import org.apache.lucene.util.Bits;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import de.ids_mannheim.korap.query.SpanQuantifierQuery;
+
+/**
+ * Enumeration of matches for a {@link SpanQuantifierQuery}: runs of
+ * occurrences of the first clause within one document, where each occurrence
+ * starts no later than the end of the previous one. Runs whose length lies
+ * between {@code min} and {@code max} are reported.
+ *
+ * All matches beginning at the same occurrence are computed together and
+ * queued in {@code matchList}; {@link #next()} hands them out one at a time.
+ */
+public class QuantifierSpans extends SimpleSpans{
+
+    private int min,max;
+    private long matchCost;
+    private List<CandidateSpan> matchList;
+    private Logger log = LoggerFactory.getLogger(QuantifierSpans.class);
+
+    /**
+     * Creates the enumeration and moves the first spans onto their first span.
+     *
+     * @param query the quantifier query supplying the clause, min and max
+     * @param context the reader context passed on to the superclass
+     * @param acceptDocs the accepted-documents bits passed on to the superclass
+     * @param termContexts the term contexts passed on to the superclass
+     * @throws IOException if advancing the first spans fails
+     */
+    public QuantifierSpans(SpanQuantifierQuery query,
+            AtomicReaderContext context, Bits acceptDocs,
+            Map<Term, TermContext> termContexts)
+            throws IOException {
+        super(query, context, acceptDocs, termContexts);
+        this.min = query.getMin();
+        this.max = query.getMax();
+        matchList = new ArrayList<CandidateSpan>();
+        hasMoreSpans = firstSpans.next();
+    }
+
+    /**
+     * Moves to the next match.
+     *
+     * @return {@code true} if a further match was found
+     * @throws IOException if reading the underlying spans fails
+     */
+    @Override
+    public boolean next() throws IOException {
+        isStartEnumeration = false;
+        return advance();
+    }
+
+    /**
+     * Reports the next queued match, refilling the queue from the current
+     * first span whenever it runs empty.
+     *
+     * @return {@code true} if a match was set, {@code false} when exhausted
+     * @throws IOException if reading the underlying spans fails
+     */
+    private boolean advance() throws IOException {
+        // Every call reports exactly one match. Clearing here (instead of
+        // only when the queue is empty) keeps the payloads of the previously
+        // reported match from leaking into the next queued one.
+        matchPayload.clear();
+
+        while (hasMoreSpans || !matchList.isEmpty()){
+            if (!matchList.isEmpty()){
+                setMatchProperties(matchList.remove(0));
+                return true;
+            }
+            matchCost = 0;
+            setMatchList();
+        }
+
+        return false;
+    }
+
+    /**
+     * Queues all matches that start at the span the first spans currently
+     * rest on, and leaves the first spans on the span the next run starts at.
+     *
+     * @throws IOException if reading the underlying spans fails
+     */
+    private void setMatchList() throws IOException {
+
+        CandidateSpan startSpan = new CandidateSpan(firstSpans);
+        // A single occurrence is itself a match only for min == 1.
+        if (min == 1 ) matchList.add(startSpan);
+
+        if (max == 1) {
+            hasMoreSpans = firstSpans.next();
+        }
+        else {
+            CandidateSpan prevSpan = startSpan;
+            Collection<byte[]> payload;
+            // n counts the occurrences in the run including the one read next.
+            int n = 2;
+            while (n <= max &&
+                    (hasMoreSpans = firstSpans.next()) &&
+                    startSpan.getDoc() == firstSpans.doc() ){
+                if (firstSpans.start() > prevSpan.getEnd()){
+                    // Gap after the previous occurrence: the run ends here.
+                    break;
+                }
+                else if (min <= n){
+                    // NOTE(review): only the payload of the last occurrence
+                    // is attached to the match - confirm this is intended.
+                    if (firstSpans.isPayloadAvailable()){
+                        payload = firstSpans.getPayload();
+                    } else {payload = null;}
+
+                    matchCost += firstSpans.cost();
+                    matchList.add(new CandidateSpan(
+                            startSpan.getStart(),
+                            firstSpans.end(),
+                            firstSpans.doc(),
+                            matchCost,
+                            payload)
+                    );
+                }
+                prevSpan = new CandidateSpan(firstSpans);
+                n++;
+            }
+        }
+    }
+
+
+    /**
+     * Copies document number, positions and (if payloads are collected)
+     * payloads of the given candidate into the current match.
+     *
+     * @param candidateSpan the queued match to report
+     * @throws IOException declared for callers; not thrown by the visible code
+     */
+    private void setMatchProperties(CandidateSpan candidateSpan)
+            throws IOException {
+        matchDocNumber = candidateSpan.getDoc();
+        matchStartPosition = candidateSpan.getStart();
+        matchEndPosition = candidateSpan.getEnd();
+        if (collectPayloads && candidateSpan.getPayloads() != null) {
+            matchPayload.addAll(candidateSpan.getPayloads());
+        }
+        log.trace("doc# {}, start {}, end {}",matchDocNumber,matchStartPosition,
+                matchEndPosition);
+    }
+
+    /**
+     * Skips the first spans to the first document whose number is at least
+     * {@code target}, drops all queued matches and reports the next match.
+     *
+     * @param target the minimum document number to continue with
+     * @return {@code true} if a match was found
+     * @throws IOException if reading the underlying spans fails
+     */
+    @Override
+    public boolean skipTo(int target) throws IOException {
+        if (hasMoreSpans && firstSpans.doc() < target){
+            if (!firstSpans.skipTo(target)){
+                hasMoreSpans = false;
+                // Nothing is left; do not keep matches from before the skip.
+                matchList.clear();
+                matchPayload.clear();
+                return false;
+            }
+        }
+        matchList.clear();
+        return advance();
+    }
+
+    /**
+     * Returns the cost accumulated for the most recently built match run.
+     * NOTE(review): this value is reset to 0 before every run, so it is 0
+     * until matches have been built - confirm callers expect that.
+     */
+    @Override
+    public long cost() {
+        return matchCost;
+    }
+
+}
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/SimpleSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/SimpleSpans.java
index 17bb6ba..350d00e 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/SimpleSpans.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/SimpleSpans.java
@@ -48,8 +48,9 @@
// Get the enumeration of the two spans to match
firstSpans = simpleSpanQuery.getFirstClause().
getSpans(context, acceptDocs, termContexts);
- secondSpans = simpleSpanQuery.getSecondClause().
- getSpans(context, acceptDocs, termContexts);
+ if (simpleSpanQuery.getSecondClause() != null)
+ secondSpans = simpleSpanQuery.getSecondClause().
+ getSpans(context, acceptDocs, termContexts);
isStartEnumeration=true;
}
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/UnorderedDistanceSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/UnorderedDistanceSpans.java
index 3d9fd0a..843fe9d 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/UnorderedDistanceSpans.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/UnorderedDistanceSpans.java
@@ -95,7 +95,7 @@
if (currentFirstSpan.getEnd() < currentSecondSpan.getEnd() ||
isLastCandidateSmaller(currentFirstSpan, currentSecondSpan)){
-// System.out.println("current target: "+firstSpanList.get(0).getStart() +" "+firstSpanList.get(0).getEnd());
+ log.trace("current target: "+firstSpanList.get(0).getStart() +" "+firstSpanList.get(0).getEnd());
// System.out.println("candidates:");
// for (CandidateSpan cs: secondSpanList) {
// System.out.println(cs.getStart() +" "+ cs.getEnd());
@@ -107,7 +107,7 @@
updateList(firstSpanList);
}
else {
-// System.out.println("current target: "+secondSpanList.get(0).getStart() +" "+secondSpanList.get(0).getEnd());
+ log.trace("current target: "+secondSpanList.get(0).getStart() +" "+secondSpanList.get(0).getEnd());
// System.out.println("candidates:");
// for (CandidateSpan cs: firstSpanList) {
// System.out.println(cs.getStart() +" "+ cs.getEnd());
@@ -116,17 +116,17 @@
matchList = findMatches(currentSecondSpan, firstSpanList);
setMatchSecondSpan(currentSecondSpan);
matchListSpanNum = 1;
- updateList(secondSpanList);
+ updateList(secondSpanList);
}
}
else if (firstSpanList.isEmpty()){
-// System.out.println("current target: "+secondSpanList.get(0).getStart() +" "+secondSpanList.get(0).getEnd());
-// System.out.println("candidates: empty");
+ log.trace("current target: "+secondSpanList.get(0).getStart() +" "+secondSpanList.get(0).getEnd());
+ log.trace("candidates: empty");
updateList(secondSpanList);
}
else{
-// System.out.println("current target: "+firstSpanList.get(0).getStart() +" "+firstSpanList.get(0).getEnd());
-// System.out.println("candidates: empty");
+ log.trace("current target: "+firstSpanList.get(0).getStart() +" "+firstSpanList.get(0).getEnd());
+ log.trace("candidates: empty");
updateList(firstSpanList);
}
}
@@ -201,10 +201,9 @@
setMatchFirstSpan(cs.getChildSpan());
else setMatchSecondSpan(cs.getChildSpan());
- log.trace("Match doc#={} start={} end={}", matchDocNumber,
- matchStartPosition,matchEndPosition);
- //System.out.println("firstspan "+getMatchFirstSpan().getStart()+" "+ getMatchFirstSpan().getEnd());
- //System.out.println("secondspan "+getMatchSecondSpan().getStart()+" "+ getMatchSecondSpan().getEnd());
+ log.trace("Match doc#={} start={} end={}",matchDocNumber,matchStartPosition,matchEndPosition);
+ log.trace("firstspan "+getMatchFirstSpan().getStart()+" "+ getMatchFirstSpan().getEnd());
+ log.trace("secondspan "+getMatchSecondSpan().getStart()+" "+ getMatchSecondSpan().getEnd());
}
@Override
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/UnorderedTokenDistanceSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/UnorderedTokenDistanceSpans.java
index f733bbb..5f1abd9 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/UnorderedTokenDistanceSpans.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/UnorderedTokenDistanceSpans.java
@@ -35,9 +35,9 @@
ensureSameDoc(firstSpans, secondSpans)){
firstSpanList.add(new CandidateSpan(firstSpans));
secondSpanList.add(new CandidateSpan(secondSpans));
+ currentDocNum = firstSpans.doc();
hasMoreFirstSpans = firstSpans.next();
- hasMoreSecondSpans = secondSpans.next();
- currentDocNum = firstSpans.doc();
+ hasMoreSecondSpans = secondSpans.next();
}
else {
hasMoreSpans = false;
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestElementDistanceIndex.java b/src/test/java/de/ids_mannheim/korap/index/TestElementDistanceIndex.java
index c855fe5..231e2eb 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestElementDistanceIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestElementDistanceIndex.java
@@ -12,6 +12,7 @@
import org.junit.runners.JUnit4;
import de.ids_mannheim.korap.KorapIndex;
+import de.ids_mannheim.korap.KorapQuery;
import de.ids_mannheim.korap.KorapResult;
import de.ids_mannheim.korap.query.SpanDistanceQuery;
import de.ids_mannheim.korap.query.SpanElementQuery;
@@ -184,4 +185,49 @@
}
+ @Test // NOTE(review): runs two searches against one indexed file but asserts nothing
+ public void testCase5() throws IOException{
+ ki = new KorapIndex();
+ ki.addDocFile(getClass().getResource("/a00/SEP-62389.json.gz").getFile(), true); // document loaded from the test resources
+ ki.commit();
+
+// KorapQuery kq = new KorapQuery("tokens");
+ SpanQuery sq = new SpanElementQuery("tokens", "s"); // element query for "s" in field "tokens"
+// //kq.seq(kq.tag("s"), kq.tag("s")).toQuery();
+// System.out.println(sq.toString());
+ //assertEquals("spanNext(<tokens:xip/c:VERB />, <tokens:xip/c:DET />)", sq.toString());
+ kr = ki.search(sq, (short) 100); // result is overwritten below without being checked
+
+
+// for (int i=0; i< kr.getTotalResults(); i++){
+// System.out.println(
+// kr.match(i).getLocalDocID()+" "+
+// kr.match(i).startPos + " " +
+// kr.match(i).endPos
+// );
+// }
+ //System.out.println(sq.toString());
+ sq = new SpanDistanceQuery( // distance between "s:weg" and "s:fahren" counted in "s" elements
+ new SpanElementQuery("tokens", "s"),
+ new SpanTermQuery(new Term("tokens","s:weg")),
+ new SpanTermQuery(new Term("tokens","s:fahren")),
+ 1,
+ 1,
+ false,
+ true
+ );
+ kr = ki.search(sq, (short) 10); // result is not checked either
+
+
+ /* System.out.print(kr.getTotalResults()+"\n");
+ for (int i=0; i< kr.getTotalResults(); i++){
+ System.out.println(
+ kr.match(i).getLocalDocID()+" "+
+ kr.match(i).startPos + " " +
+ kr.match(i).endPos
+ );
+ }*/
+
+ }
+
}
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestNextIndex.java b/src/test/java/de/ids_mannheim/korap/index/TestNextIndex.java
index 486065c..0a49a91 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestNextIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestNextIndex.java
@@ -125,7 +125,7 @@
"[(0-1)s:a|i:a|_0#0-1|-:t$<i>10]" +
"[(1-2)s:b|i:b|_1#1-2]" +
"[(2-3)s:c|i:c|_2#2-3]" +
- "[(3-4)s:a|i:a|_3#3-4|<>:x#3-7$<i>7]" +
+ "[(3-4)s:a|i:a|_3#3-4|<>:x#3-4$<i>4|<>:x#3-7$<i>7]" +
"[(4-5)s:b|i:b|_4#4-5]" +
"[(5-6)s:c|i:c|_5#5-6]" +
"[(6-7)s:a|i:a|_6#6-7]" +
@@ -145,7 +145,7 @@
);
kr = ki.search(sq, (short) 10);
- assertEquals("ab[cabca]bac", kr.match(0).getSnippetBrackets());
+ assertEquals("ab[cabca]bac", kr.match(1).getSnippetBrackets());
};
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestQuantifierIndex.java b/src/test/java/de/ids_mannheim/korap/index/TestQuantifierIndex.java
new file mode 100644
index 0000000..d07718e
--- /dev/null
+++ b/src/test/java/de/ids_mannheim/korap/index/TestQuantifierIndex.java
@@ -0,0 +1,186 @@
+package de.ids_mannheim.korap.index;
+
+import static org.junit.Assert.*;
+
+import java.io.IOException;
+
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.spans.SpanOrQuery;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.SpanTermQuery;
+import org.junit.Test;
+
+import de.ids_mannheim.korap.KorapIndex;
+import de.ids_mannheim.korap.KorapResult;
+import de.ids_mannheim.korap.query.SpanNextQuery;
+import de.ids_mannheim.korap.query.SpanQuantifierQuery;
+
+public class TestQuantifierIndex {
+
+ private KorapIndex ki;
+ private KorapResult kr;
+
+ private FieldDocument createFieldDoc0(){
+ FieldDocument fd = new FieldDocument();
+ fd.addString("ID", "doc-0");
+ fd.addTV("base",
+ "text",
+ "[(0-1)s:c|_1#0-1]" +
+ "[(1-2)s:e|_2#1-2]" +
+ "[(2-3)s:c|_3#2-3|<>:y#2-4$<i>4]" +
+ "[(3-4)s:c|s:b|_4#3-4|<>:x#3-7$<i>7]" +
+ "[(4-5)s:e|s:d|_5#4-5|<>:y#4-6$<i>6]" +
+ "[(5-6)s:c|_6#5-6|<>:y#5-8$<i>8]" +
+ "[(6-7)s:d|_7#6-7]" +
+ "[(7-8)s:e|_8#7-8|<>:x#7-9$<i>9]" +
+ "[(8-9)s:e|s:b|_9#8-9|<>:x#8-10$<i>10]" +
+ "[(9-10)s:d|_10#9-10]");
+ return fd;
+ }
+
+ private FieldDocument createFieldDoc1() {
+ FieldDocument fd = new FieldDocument();
+ fd.addString("ID", "doc-1");
+ fd.addTV("base",
+ "text",
+ "[(0-1)s:b|_1#0-1|<>:s#0-2$<i>1]" +
+ "[(1-2)s:e|_2#1-2|<>:s#1-2$<i>4]" +
+ "[(2-3)s:c|_3#2-3]" +
+ "[(3-4)s:c|s:d|_4#3-4]" +
+ "[(4-5)s:d|_5#4-5|<>:s#4-5$<i>7]" +
+ "[(5-6)s:e|_6#5-6]" +
+ "[(6-7)s:e|_7#6-7]" +
+ "[(7-8)s:c|_8#7-8|<>:x#7-9$<i>9]" +
+ "[(8-9)s:d|_9#8-9|<>:x#8-10$<i>10]" +
+ "[(9-10)s:d|_10#9-10]");
+ return fd;
+ }
+
+ private FieldDocument createFieldDoc2() {
+ FieldDocument fd = new FieldDocument();
+ fd.addString("ID", "doc-2");
+ fd.addTV("base",
+ "text",
+ "[(0-1)s:b|s:c|_1#0-1|<>:s#0-2$<i>1]" +
+ "[(1-2)s:c|_2#1-2]" +
+ "[(2-3)s:b|_3#2-3|<>:s#2-3$<i>3]" +
+ "[(3-4)s:c|_4#3-4|<>:s#3-4$<i>4]" +
+ "[(4-5)s:c|_5#4-5|<>:s#4-5$<i>5]" +
+ "[(5-6)s:b|_6#5-6]" +
+ "[(6-7)s:c|_7#6-7|<>:s#6-7$<i>7]");
+ return fd;
+ }
+
+ private FieldDocument createFieldDoc3() {
+ FieldDocument fd = new FieldDocument();
+ fd.addString("ID", "doc-3");
+ fd.addTV("base",
+ "text",
+ "[(0-1)s:a|_1#0-1|<>:s#0-2$<i>1]" +
+ "[(1-2)s:d|_2#1-2|<>:s#1-2$<i>3]" +
+ "[(2-3)s:e|_3#2-3]");
+ return fd;
+ }
+
+
+ @Test
+ public void testCase1() throws IOException{
+ ki = new KorapIndex();
+ ki.addDoc(createFieldDoc0());
+ ki.commit();
+
+ SpanQuery sq, sq2;
+ // Quantifier only
+ sq = new SpanQuantifierQuery(new SpanTermQuery(new Term("base","s:c")),1,2, true);
+ kr = ki.search(sq, (short) 10);
+ // 0-1, 2-3, 2-4, 3-4, 5-6
+ assertEquals(5,kr.getTotalResults());
+
+ // ec{1,2}
+ sq = new SpanNextQuery(
+ new SpanTermQuery(new Term("base", "s:e")),
+ new SpanQuantifierQuery(new SpanTermQuery(new Term("base","s:c")),1,2, true)
+ );
+
+ kr = ki.search(sq, (short) 10);
+ // 1-3, 1-4, 4-6
+ assertEquals(3,kr.getTotalResults());
+
+ // ec{1,2}d
+ sq2 = new SpanNextQuery(sq, new SpanTermQuery(new Term("base", "s:d")));
+ kr = ki.search(sq2, (short) 10);
+ assertEquals(2,kr.getTotalResults());
+ assertEquals(1, kr.getMatch(0).startPos);
+ assertEquals(5, kr.getMatch(0).endPos);
+ assertEquals(4, kr.getMatch(1).startPos);
+ assertEquals(7, kr.getMatch(1).endPos);
+
+ // Multiple documents
+ ki.addDoc(createFieldDoc1());
+ ki.commit();
+ kr = ki.search(sq2, (short) 10);
+ assertEquals(5,kr.getTotalResults());
+ }
+
+ /**
+  * Skip to: quantifier matches across several documents, some of which
+  * contain no match at all.
+  */
+ @Test
+ public void testCase2() throws IOException{
+     ki = new KorapIndex();
+     ki.addDoc(createFieldDoc0());
+     ki.addDoc(createFieldDoc3());
+     ki.addDoc(createFieldDoc2());
+     ki.addDoc(createFieldDoc1());
+     ki.commit();
+
+     SpanQuery sq;
+     // c{2,2}
+     sq = new SpanQuantifierQuery(new SpanTermQuery(new Term("base","s:c")),2,2, true);
+     kr = ki.search(sq, (short) 10);
+     assertEquals(4,kr.getTotalResults());
+
+     // ec{2,2}; the former repeated search of c{2,2} was dropped because
+     // its result was overwritten without being checked
+     sq = new SpanNextQuery(
+         new SpanTermQuery(new Term("base", "s:e")),
+         new SpanQuantifierQuery(new SpanTermQuery(new Term("base","s:c")),2,2, true)
+     );
+
+     kr = ki.search(sq, (short) 10);
+     assertEquals(2,kr.getTotalResults());
+     assertEquals(3,kr.getMatch(1).getLocalDocID());
+
+ }
+
+ /** OR: a term followed by one of two alternative quantified terms */
+ @Test
+ public void testCase3() throws IOException{
+     ki = new KorapIndex();
+     ki.addDoc(createFieldDoc0());
+     ki.commit();
+
+     // e(c{1,1}|b{1,1})
+     SpanQuery sq = new SpanNextQuery(
+         new SpanTermQuery(new Term("base", "s:e")),
+         new SpanOrQuery(
+             new SpanQuantifierQuery(new SpanTermQuery(new Term("base","s:c")),1,1, true),
+             new SpanQuantifierQuery(new SpanTermQuery(new Term("base","s:b")),1,1, true)
+         )
+     );
+     kr = ki.search(sq, (short) 10);
+     assertEquals(3,kr.getTotalResults());
+     assertEquals(1, kr.getMatch(0).startPos);
+     assertEquals(3, kr.getMatch(0).endPos);
+     assertEquals(4, kr.getMatch(1).startPos);
+     assertEquals(6, kr.getMatch(1).endPos);
+     assertEquals(7, kr.getMatch(2).startPos);
+     assertEquals(9, kr.getMatch(2).endPos);
+ }
+}
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestUnorderedDistanceIndex.java b/src/test/java/de/ids_mannheim/korap/index/TestUnorderedDistanceIndex.java
index dac5db2..5ef1c96 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestUnorderedDistanceIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestUnorderedDistanceIndex.java
@@ -115,7 +115,7 @@
SpanQuery sq = createQuery("s:c","s:d",0,3,false);
kr = ki.search(sq, (short) 10);
- assertEquals(5, kr.totalResults());
+ assertEquals(5, kr.totalResults());
}
/** Multiple documents
@@ -186,7 +186,6 @@
assertEquals(6,kr.getMatch(0).getEndPos());
assertEquals(3,kr.getMatch(1).getStartPos());
assertEquals(6,kr.getMatch(1).getEndPos());
-
}
/** ElementQueries */
@@ -298,19 +297,37 @@
assertEquals(3, kr.totalResults());
assertEquals(170, kr.getMatch(0).startPos);
assertEquals(172, kr.getMatch(0).endPos);
+ //System.out.println(kr.getMatch(0).getSnippetBrackets());
+ //System.out.println(kr.getMatch(0).toJSON());
assertEquals(174, kr.getMatch(1).startPos);
assertEquals(176, kr.getMatch(1).endPos);
assertEquals(71, kr.getMatch(2).startPos);
- assertEquals(73, kr.getMatch(2).endPos);
-
-// System.out.print(kr.getTotalResults()+"\n");
-// for (int i=0; i< kr.getTotalResults(); i++){
-// System.out.println(
-// kr.match(i).getLocalDocID()+" "+
-// kr.match(i).startPos + " " +
-// kr.match(i).endPos
-// );
-// }
+ assertEquals(73, kr.getMatch(2).endPos);
+ }
+
+ @Test // next-query whose second clause is itself a distance query
+ public void testCase9() throws IOException{
+ ki = new KorapIndex();
+ ki.addDoc(createFieldDoc1());
+ ki.commit();
+ SpanQuery sq = new SpanNextQuery( // "s:d" immediately followed by a match of the distance query
+ new SpanTermQuery(new Term("base","s:d")),
+ createQuery("s:c","s:e",1,2,false)
+ );
+ kr = ki.search(sq, (short) 10);
+//
+// System.out.print(kr.getTotalResults()+"\n");
+// for (int i=0; i< kr.getTotalResults(); i++){
+// System.out.println(
+// kr.match(i).getLocalDocID()+" "+
+// kr.match(i).startPos + " " +
+// kr.match(i).endPos
+// );
+// }
+
+ assertEquals(3, kr.totalResults());
+ assertEquals(0,kr.getMatch(1).getStartPos()); // only the second of the three matches is checked
+ assertEquals(4,kr.getMatch(1).getEndPos());
}
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestUnorderedElementDistanceIndex.java b/src/test/java/de/ids_mannheim/korap/index/TestUnorderedElementDistanceIndex.java
index 703e96e..fc9793c 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestUnorderedElementDistanceIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestUnorderedElementDistanceIndex.java
@@ -215,4 +215,50 @@
}
+// @Test
+// public void testCase5() throws IOException{
+// ki = new KorapIndex();
+// ki.addDoc(createFieldDoc0());
+// ki.commit();
+// SpanQuery sq, edq;
+// edq = createQuery("s", "s:b", "s:c", 0, 2,false);
+// kr = ki.search(edq, (short) 10);
+//
+// System.out.print(kr.getTotalResults()+"\n");
+// for (int i=0; i< kr.getTotalResults(); i++){
+// System.out.println(
+// kr.match(i).getLocalDocID()+" "+
+// kr.match(i).startPos + " " +
+// kr.match(i).endPos
+// );
+// }
+////
+//// System.out.println("h");
+//// sq = new SpanTermQuery(new Term("base", "s:b"));
+////
+//// kr = ki.search(sq, (short) 10);
+////
+//// System.out.print(kr.getTotalResults()+"\n");
+//// for (int i=0; i< kr.getTotalResults(); i++){
+//// System.out.println(
+//// kr.match(i).getLocalDocID()+" "+
+//// kr.match(i).startPos + " " +
+//// kr.match(i).endPos
+//// );
+//// }
+//
+// sq = new SpanNextQuery(
+// new SpanTermQuery(new Term("base", "s:b"))
+// ,edq);
+// kr = ki.search(sq, (short) 10);
+//
+// System.out.print(kr.getTotalResults()+"\n");
+// for (int i=0; i< kr.getTotalResults(); i++){
+// System.out.println(
+// kr.match(i).getLocalDocID()+" "+
+// kr.match(i).startPos + " " +
+// kr.match(i).endPos
+// );
+// }
+// }
}
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestWPDIndex.java b/src/test/java/de/ids_mannheim/korap/index/TestWPDIndex.java
new file mode 100644
index 0000000..0f897d8
--- /dev/null
+++ b/src/test/java/de/ids_mannheim/korap/index/TestWPDIndex.java
@@ -0,0 +1,218 @@
+package de.ids_mannheim.korap.index;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Properties;
+
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.SpanTermQuery;
+import org.apache.lucene.store.MMapDirectory;
+import org.junit.Test;
+
+import de.ids_mannheim.korap.KorapCollection;
+import de.ids_mannheim.korap.KorapIndex;
+import de.ids_mannheim.korap.KorapMatch;
+import de.ids_mannheim.korap.KorapResult;
+import de.ids_mannheim.korap.KorapSearch;
+import de.ids_mannheim.korap.filter.BooleanFilter;
+import de.ids_mannheim.korap.query.SpanDistanceQuery;
+import de.ids_mannheim.korap.query.SpanElementQuery;
+import de.ids_mannheim.korap.query.SpanNextQuery;
+import de.ids_mannheim.korap.query.SpanQuantifierQuery;
+
+public class TestWPDIndex {
+ long start, end;
+ KorapIndex ki;
+ KorapResult kr;
+ KorapSearch ks;
+
+ private SpanDistanceQuery createElementDistanceQuery(String e, String x, String y,
+ int min, int max, boolean isOrdered){
+ SpanDistanceQuery sq = new SpanDistanceQuery(
+ new SpanElementQuery("tokens", "s"),
+ new SpanTermQuery(new Term("tokens",x)),
+ new SpanTermQuery(new Term("tokens",y)),
+ min,
+ max,
+ isOrdered,
+ true
+ );
+ return sq;
+ }
+
+ private SpanDistanceQuery createDistanceQuery(String x, String y, int min, int max, boolean isOrdered){
+ SpanDistanceQuery sq = new SpanDistanceQuery(
+ new SpanTermQuery(new Term("tokens",x)),
+ new SpanTermQuery(new Term("tokens",y)),
+ min,
+ max,
+ isOrdered,
+ true
+ );
+ return sq;
+ }
+
+ /**
+  * Opens the Lucene index whose directory is configured as
+  * {@code lucene.indexDir} in the classpath resource {@code /korap.conf}.
+  *
+  * @throws IOException if the configuration is missing or unreadable, or
+  *         the index directory cannot be opened
+  */
+ public TestWPDIndex() throws IOException {
+     Properties prop = new Properties();
+     // try-with-resources closes the configuration stream again
+     try (InputStream is = getClass().getResourceAsStream("/korap.conf")) {
+         if (is == null) {
+             // fail with a readable message instead of an NPE inside load()
+             throw new IOException("Resource /korap.conf not found on the classpath");
+         }
+         prop.load(is);
+     }
+
+     String indexPath = prop.getProperty("lucene.indexDir");
+     MMapDirectory md = new MMapDirectory(new File(indexPath));
+     ki = new KorapIndex(md);
+ }
+
+ /** Token distance spans */
+ @Test
+ public void testCase1() throws IOException{
+ SpanDistanceQuery sq;
+ // ordered
+ sq = createDistanceQuery("s:Wir", "s:kommen", 1, 1, true);
+ ks = new KorapSearch(sq);
+ kr = ks.run(ki);
+ assertEquals(8, kr.getTotalResults());
+
+ // unordered
+ sq = createDistanceQuery("s:Wir", "s:kommen", 1, 1, false);
+ ks = new KorapSearch(sq);
+ kr = ks.run(ki);
+ assertEquals(11, kr.getTotalResults());
+
+ sq = createDistanceQuery("s:kommen", "s:Wir", 1, 1, false);
+ ks = new KorapSearch(sq);
+ kr = ks.run(ki);
+ assertEquals(11, kr.getTotalResults());
+ }
+
+ /** Token exclusion distance spans */
+ @Test
+ public void testCase2() throws IOException{
+
+ SpanQuery q = new SpanTermQuery(new Term("tokens","s:Wir"));
+ ks = new KorapSearch(q);
+ kr = ks.run(ki);
+ assertEquals(1907, kr.getTotalResults());
+
+ SpanDistanceQuery sq;
+ // ordered
+ sq = createDistanceQuery("s:Wir", "s:kommen", 1, 1, true);
+ sq.setExclusion(true);
+ ks = new KorapSearch(sq);
+ kr = ks.run(ki);
+ assertEquals(1899, kr.getTotalResults());
+
+ // unordered
+ sq = createDistanceQuery("s:Wir", "s:kommen", 1, 1, false);
+ sq.setExclusion(true);
+ ks = new KorapSearch(sq);
+ kr = ks.run(ki);
+ assertEquals(1896, kr.getTotalResults());
+
+// System.out.println(kr.getTotalResults());
+// for (KorapMatch km : kr.getMatches()){
+// System.out.println(km.getDocID() +" "+km.getStartPos() +" "+ km.getEndPos());
+// System.out.println(km.getSnippetBrackets());
+// }
+ }
+
+ /** Element distance spans */
+ @Test
+ public void testCase3() throws IOException{
+ // ordered
+ SpanDistanceQuery sq = createElementDistanceQuery("s","s:weg", "s:fahren", 0, 1, true);
+ ks = new KorapSearch(sq);
+ kr = ks.run(ki);
+ assertEquals(3,kr.getTotalResults());
+
+ // unordered
+ sq = createElementDistanceQuery("s","s:weg", "s:fahren", 0, 1, false);
+ ks = new KorapSearch(sq);
+ kr = ks.run(ki);
+ assertEquals(5,kr.getTotalResults());
+
+ // only 0
+ sq = createElementDistanceQuery("s","s:weg", "s:fahren", 0, 0, false);
+ kr = ki.search(sq, (short) 100);
+ assertEquals(2,kr.getTotalResults());
+ assertEquals("WPD_BBB.04463", kr.match(0).getDocID());
+ assertEquals(1094,kr.getMatch(0).getStartPos());
+ assertEquals(1115,kr.getMatch(0).getEndPos());
+ assertEquals("WPD_III.00758", kr.match(1).getDocID());
+ assertEquals(444,kr.getMatch(1).getStartPos());
+ assertEquals(451,kr.getMatch(1).getEndPos());
+
+ // only 1
+ sq = createElementDistanceQuery("s","s:weg", "s:fahren", 1, 1, false);
+ ks = new KorapSearch(sq);
+ kr = ks.run(ki);
+ assertEquals(3,kr.getTotalResults());
+ }
+
+ /** Element distance exclusion */
+ @Test
+ public void testCase4() throws IOException{
+ SpanDistanceQuery sq = createElementDistanceQuery("s","s:weg", "s:fahren", 1, 1, false);
+ sq.setExclusion(true);
+
+ ks = new KorapSearch(sq);
+ kr = ks.run(ki);
+ assertEquals(979,kr.getTotalResults());
+ //0.8s
+
+ // Check if it includes some results
+ BooleanFilter bf = new BooleanFilter();
+ bf.or("ID", "WPD_BBB.04463", "WPD_III.00758");
+ KorapCollection kc = new KorapCollection();
+ kc.filter(bf);
+ ks.setCollection(kc);
+ kr = ks.run(ki);
+ assertEquals(1094,kr.getMatch(0).getStartPos());
+ assertEquals(451,kr.getMatch(1).getEndPos());
+ }
+
+ /** Quantifier */
+ @Test
+ public void testCase5() throws IOException{
+ SpanQuery sq;
+ sq = new SpanQuantifierQuery(new SpanTermQuery(new Term("tokens","mate/p:ADJA")),1,2, true);
+ ks = new KorapSearch(sq);
+ kr = ks.run(ki);
+ assertEquals(4116416, kr.getTotalResults());
+ //0.9s
+
+ sq = new SpanQuantifierQuery(new SpanTermQuery(new Term("tokens","mate/p:ADJA")),1,1, true);
+ ks = new KorapSearch(sq);
+ kr = ks.run(ki);
+ assertEquals(3879671, kr.getTotalResults());
+
+ sq = new SpanQuantifierQuery(new SpanTermQuery(new Term("tokens","mate/p:ADJA")),2,2, true);
+ ks = new KorapSearch(sq);
+ kr = ks.run(ki);
+ assertEquals(236745, kr.getTotalResults());
+ //0.65s
+ }
+
+ /** Next and quantifier */
+ @Test
+ public void testCase6() throws IOException{
+ SpanQuery sq = new SpanNextQuery(
+ new SpanTermQuery(new Term("tokens", "tt/p:NN")),
+ new SpanQuantifierQuery(new SpanTermQuery(new Term("tokens","mate/p:ADJA")),2,2, true)
+ );
+ ks = new KorapSearch(sq);
+ kr = ks.run(ki);
+ assertEquals(30223, kr.getTotalResults());
+ // 1.1s
+
+ SpanQuery sq2 = new SpanNextQuery(sq,
+ new SpanTermQuery(new Term("tokens", "tt/p:NN")));
+ ks = new KorapSearch(sq2);
+ kr = ks.run(ki);
+ assertEquals(26607, kr.getTotalResults());
+ // 1.1s
+ }
+}