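Fixed #179: ExpandedSpans.next() now steps firstSpans itself when no candidates are buffered; advance() no longer calls firstSpans.next() in its loop condition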
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/ExpandedSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/ExpandedSpans.java
index 245df4d..3fd0afe 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/ExpandedSpans.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/ExpandedSpans.java
@@ -41,11 +41,13 @@
public boolean next() throws IOException {
matchPayload.clear();
isStartEnumeration = false;
+ if (candidateSpans.size() == 0 && hasMoreSpans) // no buffered candidates: step firstSpans here, because advance()'s loop condition no longer calls firstSpans.next()
+ hasMoreSpans = firstSpans.next();
return advance();
}
private boolean advance() throws IOException {
- while (candidateSpans.size() > 0 || (hasMoreSpans = firstSpans.next())) {
+ while (candidateSpans.size() > 0 || hasMoreSpans) {
if (candidateSpans.size() > 0 ){
setMatch(candidateSpans.get(0));
candidateSpans.remove(0);
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestSpanExpansionIndex.java b/src/test/java/de/ids_mannheim/korap/index/TestSpanExpansionIndex.java
index 6173ad6..0a90445 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestSpanExpansionIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestSpanExpansionIndex.java
@@ -2,9 +2,12 @@
import static org.junit.Assert.*;
+import java.io.BufferedReader;
+import java.io.FileReader;
import java.io.IOException;
import org.apache.lucene.index.Term;
+import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
import org.apache.lucene.util.automaton.RegExp;
@@ -14,10 +17,13 @@
import de.ids_mannheim.korap.KorapIndex;
import de.ids_mannheim.korap.KorapMatch;
+import de.ids_mannheim.korap.KorapQuery;
import de.ids_mannheim.korap.KorapResult;
import de.ids_mannheim.korap.query.SpanElementQuery;
import de.ids_mannheim.korap.query.SpanExpansionQuery;
import de.ids_mannheim.korap.query.SpanRepetitionQuery;
+import de.ids_mannheim.korap.query.wrap.SpanQueryWrapper;
+import de.ids_mannheim.korap.util.QueryException;
public class TestSpanExpansionIndex {
@@ -253,6 +259,37 @@
assertEquals(1, kr.getMatch(4).getStartPos());
assertEquals(4, kr.getMatch(4).getEndPos());
}
+
+ /** Skip to: runs the sequence "Aa" "durch" [] from /queries/poly3.json against an index of
+ * /wiki/00001 and /wiki/00002 and expects the first match at positions 205-208. */
+ @Test
+ public void testCase7() throws IOException, QueryException{
+ KorapIndex ki = new KorapIndex();
+ ki.addDocFile(
+ getClass().getResource("/wiki/00001.json.gz").getFile(),true);
+ ki.addDocFile(
+ getClass().getResource("/wiki/00002.json.gz").getFile(),true);
+ ki.commit();
+
+ String jsonPath = getClass().getResource("/queries/poly3.json").getFile();
+ String jsonQuery = readFile(jsonPath);
+ SpanQueryWrapper sqwi = new KorapQuery("tokens").fromJSON(
+ jsonQuery
+ );
+
+ SpanQuery sq = sqwi.toQuery();
+ //System.out.println(sq.toString());
+ kr = ki.search(sq, (short) 20);
+
+ assertEquals(205, kr.getMatch(0).getStartPos());
+ assertEquals(208, kr.getMatch(0).getEndPos());
+
+ /*for (KorapMatch km : kr.getMatches()){
+ System.out.println(km.getStartPos() +","+km.getEndPos()+" "
+ +km.getSnippetBrackets()
+ );
+ }*/
+ }
/**
@@ -261,29 +298,31 @@
* */
@Test
public void testQueryRewriteBug() throws IOException {
- KorapIndex ki = new KorapIndex();
- ki.addDoc(createFieldDoc0()); // same doc
- ki.addDoc(createFieldDoc1()); // only not clause
- ki.addDoc(createFieldDoc2()); // only main clause
- ki.commit();
-
- // See /queries/bugs/repetition_group_rewrite
- // spanRepetition(spanExpansion(
- // SpanMultiTermQueryWrapper(tokens:/cnx/p:A/), []{1, 1}, right){2,2}
- // )
- RegexpQuery requery = new RegexpQuery(new Term("base", "s:[ac]"), RegExp.ALL);
- SpanMultiTermQueryWrapper<RegexpQuery> query =
- new SpanMultiTermQueryWrapper<RegexpQuery>( requery );
- SpanExpansionQuery seq = new SpanExpansionQuery(query, 1, 1, 1, true);
- SpanRepetitionQuery rep = new SpanRepetitionQuery(seq, 2, 2, true);
+ KorapIndex ki = new KorapIndex();
+ ki.addDoc(createFieldDoc0()); // same doc
+ ki.addDoc(createFieldDoc1()); // only not clause
+ ki.addDoc(createFieldDoc2()); // only main clause
+ ki.commit();
- try {
- ki.search(rep, (short) 20);
- }
- catch (Exception e) {
- fail(e.getMessage());
- };
- };
+ // See /queries/bugs/repetition_group_rewrite
+ // spanRepetition(spanExpansion(
+ // SpanMultiTermQueryWrapper(tokens:/cnx/p:A/), []{1, 1}, right){2,2}
+ // )
+ RegexpQuery requery = new RegexpQuery(new Term("base", "s:[ac]"), RegExp.ALL);
+ SpanMultiTermQueryWrapper<RegexpQuery> query =
+ new SpanMultiTermQueryWrapper<RegexpQuery>( requery );
+ SpanExpansionQuery seq = new SpanExpansionQuery(query, 1, 1, 1, true);
+ SpanRepetitionQuery rep = new SpanRepetitionQuery(seq, 2, 2, true);
+
+ kr = ki.search(rep, (short) 20);
+ assertEquals(3,kr.getTotalResults());
+
+ /*for (KorapMatch km : kr.getMatches()){
+ System.out.println(km.getStartPos() +","+km.getEndPos()+" "
+ +km.getSnippetBrackets()
+ );
+ }*/
+ }
private FieldDocument createFieldDoc0(){
@@ -309,12 +348,12 @@
fd.addString("ID", "doc-1");
fd.addTV("base",
"bbccdd",
- "[(0-1)s:b|s:c|_1#0-1]" +
- "[(1-2)s:b|_2#1-2]" +
- "[(2-3)s:c|_3#2-3]" +
- "[(3-4)s:c|_4#3-4]" +
- "[(4-5)s:d|_5#4-5]" +
- "[(5-6)s:d|_6#5-6]");
+ "[(0-1)s:b|s:c|_0#0-1]" +
+ "[(1-2)s:b|_1#1-2]" +
+ "[(2-3)s:c|_2#2-3]" +
+ "[(3-4)s:c|_3#3-4]" +
+ "[(4-5)s:d|_4#4-5]" +
+ "[(5-6)s:d|_5#5-6]");
return fd;
}
@@ -322,13 +361,28 @@
FieldDocument fd = new FieldDocument();
fd.addString("ID", "doc-2");
fd.addTV("base",
- "text",
- "[(0-1)s:b|s:c|_1#0-1]" +
- "[(1-2)s:e|_2#1-2]" +
- "[(2-3)s:c|_3#2-3]" +
- "[(3-4)s:c|_4#3-4]" +
- "[(4-5)s:e|_5#4-5]" +
- "[(5-6)s:a|_6#5-6]");
+ "beccea",
+ "[(0-1)s:b|s:c|_0#0-1]" +
+ "[(1-2)s:e|_1#1-2]" +
+ "[(2-3)s:c|_2#2-3]" +
+ "[(3-4)s:c|_3#3-4]" +
+ "[(4-5)s:e|_4#4-5]" +
+ "[(5-6)s:a|_5#5-6]");
return fd;
}
+
+ /** Reads the file at {@code path} into one string (line terminators are dropped); an IOException fails the test. */
+ private String readFile(String path) {
+ StringBuilder sb = new StringBuilder();
+ // try-with-resources closes the reader even when readLine() throws, so the file handle cannot leak
+ try (BufferedReader in = new BufferedReader(new FileReader(path))) {
+ String str;
+ while ((str = in.readLine()) != null) {
+ sb.append(str);
+ }
+ } catch (IOException e) {
+ fail(e.getMessage());
+ }
+ return sb.toString();
+ }
}
diff --git a/src/test/java/de/ids_mannheim/korap/query/TestTemporaryQueryLimitations.java b/src/test/java/de/ids_mannheim/korap/query/TestTemporaryQueryLimitations.java
index f8a2963..5120bab 100644
--- a/src/test/java/de/ids_mannheim/korap/query/TestTemporaryQueryLimitations.java
+++ b/src/test/java/de/ids_mannheim/korap/query/TestTemporaryQueryLimitations.java
@@ -1,4 +1,6 @@
-package de.ids_mannheim.korap.highlight;
+package de.ids_mannheim.korap.query;
+
+// Moved here from de.ids_mannheim.korap.highlight so the package matches the file's directory.
import java.util.*;
import java.io.IOException;
diff --git a/src/test/java/de/ids_mannheim/korap/search/TestKorapSearch.java b/src/test/java/de/ids_mannheim/korap/search/TestKorapSearch.java
index 180d3ce..8765457 100644
--- a/src/test/java/de/ids_mannheim/korap/search/TestKorapSearch.java
+++ b/src/test/java/de/ids_mannheim/korap/search/TestKorapSearch.java
@@ -943,82 +943,84 @@
*/
@Test
public void searchJSONexpansionBug () throws IOException {
- // Construct index
- KorapIndex ki = new KorapIndex();
- // Indexing test files
- ki.addDocFile(
- getClass().getResource("/wiki/00002.json.gz").getFile(), true
- );
- ki.commit();
-
- // Expansion bug
- // der alte Digraph Aa durch Å
- String json = getString(
- getClass().getResource("/queries/bugs/expansion_bug_2.jsonld").getFile()
- );
-
- KorapResult kr = new KorapSearch(json).run(ki);
- assertEquals("... Buchstabe des Alphabetes. In Dänemark ist " +
- "[der alte Digraph Aa durch Å] ersetzt worden, " +
- "in Eigennamen und Ortsnamen ...",
- kr.getMatch(0).getSnippetBrackets());
- assertEquals("WPD_AAA.00002", kr.getMatch(0).getDocID());
- assertEquals(1, kr.getTotalResults());
-
- // der alte Digraph Aa durch []
- // Works with one document
- json = getString(
- getClass().getResource("/queries/bugs/expansion_bug.jsonld").getFile()
- );
-
- kr = new KorapSearch(json).run(ki);
- assertEquals("... Buchstabe des Alphabetes. In Dänemark ist " +
- "[der alte Digraph Aa durch Å] ersetzt worden, " +
- "in Eigennamen und Ortsnamen ...",
- kr.getMatch(0).getSnippetBrackets());
- assertEquals("WPD_AAA.00002", kr.getMatch(0).getDocID());
- assertEquals(1, kr.getTotalResults());
-
- // Now try with one file ahead
- ki = new KorapIndex();
- for (String i : new String[] {"00001",
- "00002"}) {
- ki.addDocFile(
- getClass().getResource("/wiki/" + i + ".json.gz").getFile(), true
- );
- };
- ki.commit();
-
- // Expansion bug
- // der alte Digraph Aa durch Å
- json = getString(
- getClass().getResource("/queries/bugs/expansion_bug_2.jsonld").getFile()
- );
-
- kr = new KorapSearch(json).run(ki);
- assertEquals("... Buchstabe des Alphabetes. In Dänemark ist " +
- "[der alte Digraph Aa durch Å] ersetzt worden, " +
- "in Eigennamen und Ortsnamen ...",
- kr.getMatch(0).getSnippetBrackets());
- assertEquals("WPD_AAA.00002", kr.getMatch(0).getDocID());
- assertEquals(1, kr.getTotalResults());
-
- // der alte Digraph Aa durch []
- json = getString(
- getClass().getResource("/queries/bugs/expansion_bug.jsonld").getFile()
- );
-
- kr = new KorapSearch(json).run(ki);
-
- if (kr.getTotalResults() != 1)
- fail("Expansion fails");
-
- assertEquals("... Buchstabe des Alphabetes. In Dänemark ist " +
- "[der alte Digraph Aa durch Å] ersetzt worden, " +
- "in Eigennamen und Ortsnamen ...",
- kr.getMatch(0).getSnippetBrackets());
- assertEquals("WPD_AAA.00002", kr.getMatch(0).getDocID());
- assertEquals(1, kr.getTotalResults());
+ // Construct index
+ KorapIndex ki = new KorapIndex();
+ // Indexing test files
+ ki.addDocFile(
+ getClass().getResource("/wiki/00002.json.gz").getFile(), true
+ );
+ ki.commit();
+
+ // Expansion bug
+ // der alte Digraph Aa durch Å
+ String json = getString(
+ getClass().getResource("/queries/bugs/expansion_bug_2.jsonld").getFile()
+ );
+
+ KorapResult kr = new KorapSearch(json).run(ki);
+ assertEquals("... Buchstabe des Alphabetes. In Dänemark ist " +
+ "[der alte Digraph Aa durch Å] ersetzt worden, " +
+ "in Eigennamen und Ortsnamen ...",
+ kr.getMatch(0).getSnippetBrackets());
+ assertEquals("WPD_AAA.00002", kr.getMatch(0).getDocID());
+ assertEquals(1, kr.getTotalResults());
+
+ // der alte Digraph Aa durch []
+ // Works with one document
+ json = getString(
+ getClass().getResource("/queries/bugs/expansion_bug.jsonld").getFile()
+ );
+
+ kr = new KorapSearch(json).run(ki);
+
+ assertEquals("... Buchstabe des Alphabetes. In Dänemark ist " +
+ "[der alte Digraph Aa durch Å] ersetzt worden, " +
+ "in Eigennamen und Ortsnamen ...",
+ kr.getMatch(0).getSnippetBrackets());
+ assertEquals("WPD_AAA.00002", kr.getMatch(0).getDocID());
+ assertEquals(1, kr.getTotalResults());
+
+ // Now try with one file ahead
+ ki = new KorapIndex();
+ for (String i : new String[] {"00001",
+ "00002"}) {
+ ki.addDocFile(
+ getClass().getResource("/wiki/" + i + ".json.gz").getFile(), true
+ );
+ };
+ ki.commit();
+
+ // Expansion bug
+ // der alte Digraph Aa durch Å
+ json = getString(
+ getClass().getResource("/queries/bugs/expansion_bug_2.jsonld").getFile()
+ );
+
+ kr = new KorapSearch(json).run(ki);
+
+ assertEquals("... Buchstabe des Alphabetes. In Dänemark ist " +
+ "[der alte Digraph Aa durch Å] ersetzt worden, " +
+ "in Eigennamen und Ortsnamen ...",
+ kr.getMatch(0).getSnippetBrackets());
+ assertEquals("WPD_AAA.00002", kr.getMatch(0).getDocID());
+ assertEquals(1, kr.getTotalResults());
+
+ // der alte Digraph Aa durch []
+ json = getString(
+ getClass().getResource("/queries/bugs/expansion_bug.jsonld").getFile()
+ );
+
+ kr = new KorapSearch(json).run(ki);
+ System.out.println(kr.getQuery());
+// if (kr.getTotalResults() != 1)
+// fail("Expansion fails");
+
+ assertEquals("... Buchstabe des Alphabetes. In Dänemark ist " +
+ "[der alte Digraph Aa durch Å] ersetzt worden, " +
+ "in Eigennamen und Ortsnamen ...",
+ kr.getMatch(0).getSnippetBrackets());
+ assertEquals("WPD_AAA.00002", kr.getMatch(0).getDocID());
+ assertEquals(1, kr.getTotalResults());
};
diff --git a/src/test/resources/queries/poly3.json b/src/test/resources/queries/poly3.json
new file mode 100644
index 0000000..45492c7
--- /dev/null
+++ b/src/test/resources/queries/poly3.json
@@ -0,0 +1,45 @@
+{
+ "@context" : "http://ids-mannheim.de/ns/KorAP/json-ld/v0.2/context.jsonld",
+ "collection" : {},
+ "collections" : [
+ {
+ "@type" : "korap:meta-filter",
+ "@value" : {
+ "@field" : "korap:field#corpusID",
+ "@type" : "korap:term",
+ "@value" : "WPD"
+ }
+ }
+ ],
+ "errors" : [],
+ "messages" : [],
+ "meta" : {},
+ "query" : {
+ "@type" : "korap:group",
+ "operands" : [
+ {
+ "@type" : "korap:token",
+ "wrap" : {
+ "@type" : "korap:term",
+ "key" : "Aa",
+ "layer" : "orth",
+ "match" : "match:eq"
+ }
+ },
+ {
+ "@type" : "korap:token",
+ "wrap" : {
+ "@type" : "korap:term",
+ "key" : "durch",
+ "layer" : "orth",
+ "match" : "match:eq"
+ }
+ },
+ {
+ "@type" : "korap:token"
+ }
+ ],
+ "operation" : "operation:sequence"
+ },
+ "warnings" : []
+}
\ No newline at end of file