Support for Wildcard queries
diff --git a/CHANGES b/CHANGES
index 7fe775f..fe98544 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,3 +1,8 @@
+0.25 2013-12-20
+ - Support for Wildcard Queries.
+ - Improved support for regular expressions.
+ - Introduced keyword fields that store no positions.
+
0.24_1 2013-12-05
- This is a pseudo version for demo versions with dirty hacks,
meant to be rolled back!
diff --git a/pom.xml b/pom.xml
index c38d036..dd12c85 100644
--- a/pom.xml
+++ b/pom.xml
@@ -9,7 +9,7 @@
<groupId>KorAP-modules</groupId>
<artifactId>KorAP-lucene-index</artifactId>
- <version>0.24</version>
+ <version>0.25</version>
<packaging>jar</packaging>
<name>KorAP-lucene-index</name>
diff --git a/src/main/java/de/ids_mannheim/korap/KorapIndex.java b/src/main/java/de/ids_mannheim/korap/KorapIndex.java
index 674dee8..9359848 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapIndex.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapIndex.java
@@ -12,6 +12,7 @@
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.DocIdSet;
+import org.apache.lucene.search.Query;
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
@@ -500,6 +501,7 @@
log.trace("Start search");
this.termContexts = new HashMap<Term, TermContext>();
+
SpanQuery query = ks.getQuery();
String foundry = query.getField();
@@ -537,6 +539,11 @@
try {
+ // Rewrite query
+ for (Query rewrittenQuery = query.rewrite(this.reader()); rewrittenQuery != (Query) query; rewrittenQuery = query.rewrite(this.reader())) {
+ query = (SpanQuery) rewrittenQuery;
+ };
+
for (AtomicReaderContext atomic : this.reader().leaves()) {
// Use OpenBitSet;
diff --git a/src/main/java/de/ids_mannheim/korap/KorapQuery.java b/src/main/java/de/ids_mannheim/korap/KorapQuery.java
index b0da010..c011d6e 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapQuery.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapQuery.java
@@ -239,7 +239,7 @@
- // SpanSegmentRegexQuery
+ // SpanRegexQueryWrapper
/**
* Create a query object based on a regular expression.
* @param re The regular expession as a string.
@@ -257,7 +257,6 @@
return new SpanRegexQueryWrapper(this.field, re, flags, false);
};
-
/**
* Create a query object based on a regular expression.
* @param re The regular expession as a string.
@@ -268,7 +267,6 @@
return new SpanRegexQueryWrapper(this.field, re, flags, caseinsensitive);
};
-
/**
* Create a query object based on a regular expression.
* @param re The regular expession as a string.
@@ -278,6 +276,30 @@
return new SpanRegexQueryWrapper(this.field, re, RegExp.ALL, caseinsensitive);
};
+ // SpanWildcardQueryWrapper
+ /**
+ * Build a case-sensitive wildcard query wrapper for the field
+ * of this query object.
+ * @param wc The wildcard term as a string.
+ * @return The wrapper around the wildcard query.
+ */
+ public SpanWildcardQueryWrapper wc (String wc) {
+ // The two-argument constructor matches case-sensitively
+ return new SpanWildcardQueryWrapper(this.field, wc);
+ };
+
+ /**
+ * Build a wildcard query wrapper for the field of this query object.
+ * @param wc The wildcard term as a string.
+ * @param caseinsensitive A boolean value indicating case insensitivity.
+ * @return The wrapper around the wildcard query.
+ */
+ public SpanWildcardQueryWrapper wc (String wc, boolean caseinsensitive) {
+ SpanWildcardQueryWrapper wrapper =
+ new SpanWildcardQueryWrapper(this.field, wc, caseinsensitive);
+ return wrapper;
+ };
+
// SpanSegmentQueries
/**
diff --git a/src/main/java/de/ids_mannheim/korap/KorapSearch.java b/src/main/java/de/ids_mannheim/korap/KorapSearch.java
index 57a4ae6..d7401ed 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapSearch.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapSearch.java
@@ -160,10 +160,38 @@
this.query = sq;
};
+ /**
+ * Construct a search object without setting a query.
+ * The query can be supplied afterwards using setQuery().
+ */
+ public KorapSearch () {
+ };
+
public SpanQuery getQuery () {
return this.query;
};
+ /**
+ * Set the query by means of a query wrapper.
+ * @param sqwi The wrapper to take the span query from.
+ * @return This search object, for chaining.
+ */
+ public KorapSearch setQuery (SpanQueryWrapperInterface sqwi) {
+ SpanQuery unwrapped = sqwi.toQuery();
+ this.query = unwrapped;
+ return this;
+ };
+
+ /**
+ * Set the query.
+ * @param sq The span query to search for.
+ * @return This search object, for chaining.
+ */
+ public KorapSearch setQuery (SpanQuery sq) {
+ this.query = sq;
+ return this;
+ };
+
public int getStartIndex () {
return this.startIndex;
};
@@ -248,6 +260,12 @@
};
public KorapResult run (KorapIndex ki) {
+ if (this.query == null) {
+ KorapResult kr = new KorapResult();
+ kr.setError(this.getClass() + " expects a query");
+ return kr;
+ };
+
if (this.error != null) {
KorapResult kr = new KorapResult();
kr.setError(this.error);
diff --git a/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java b/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java
index 8711c0c..99a0cab 100644
--- a/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java
+++ b/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java
@@ -39,20 +39,31 @@
public Document doc = new Document();
- private FieldType tvField = new FieldType(TextField.TYPE_STORED);
+ private FieldType tvField = new FieldType(TextField.TYPE_STORED);
private FieldType tvNoField = new FieldType(TextField.TYPE_NOT_STORED);
-
- // private HashMap<String, MultiTermTokenStream> termFields;
+ private FieldType keywords = new FieldType(TextField.TYPE_STORED);
{
tvField.setStoreTermVectors(true);
tvField.setStoreTermVectorPositions(true);
tvField.setStoreTermVectorPayloads(true);
+ tvField.setStoreTermVectorOffsets(false);
tvNoField.setStoreTermVectors(true);
tvNoField.setStoreTermVectorPositions(true);
tvNoField.setStoreTermVectorPayloads(true);
- // termFields = new HashMap<String, MultiTermTokenStream>();
+ tvNoField.setStoreTermVectorOffsets(false);
+
+ // Keyword fields keep term vectors, but without positions,
+ // payloads and offsets
+ keywords.setStoreTermVectors(true);
+ keywords.setStoreTermVectorPositions(false);
+ keywords.setStoreTermVectorPayloads(false);
+ keywords.setStoreTermVectorOffsets(false);
+ // The term vector flags above do not change the postings:
+ // TextField.TYPE_STORED indexes positions, so they have to be
+ // dropped explicitly
+ keywords.setIndexOptions(org.apache.lucene.index.FieldInfo.IndexOptions.DOCS_AND_FREQS);
}
// see http://www.cowtowncoder.com/blog/archives/2011/07/entry_457.html
@@ -94,6 +99,17 @@
doc.add(new TextField(key, value, Field.Store.YES));
};
+ /**
+ * Add a field to the document that uses the
+ * "keywords" field type.
+ * @param key The name of the field.
+ * @param value The value of the field.
+ */
+ public void addKeyword (String key, String value) {
+ Field keywordField = new Field(key, value, this.keywords);
+ this.doc.add(keywordField);
+ };
+
public void addString (String key, String value) {
doc.add(new StringField(key, value, Field.Store.YES));
};
@@ -106,7 +115,6 @@
doc.add(new StoredField(key, value));
};
-
public void addTV (String key, String value, String tsString) {
this.addTV(key, value, new MultiTermTokenStream(tsString));
};
@@ -163,8 +171,9 @@
// Store this information as well as tokenization information
// as meta fields in the tokenization term vector
if (field.containsKey("foundries")) {
+ // TODO: Do not store positions!
String foundries = (String) field.get("foundries");
- this.addText("foundries", foundries);
+ this.addKeyword("foundries", foundries);
super.setFoundries(foundries);
};
if (field.containsKey("tokenization")) {
diff --git a/src/main/java/de/ids_mannheim/korap/query/wrap/SpanRegexQueryWrapper.java b/src/main/java/de/ids_mannheim/korap/query/wrap/SpanRegexQueryWrapper.java
index 8152a6f..3cd4051 100644
--- a/src/main/java/de/ids_mannheim/korap/query/wrap/SpanRegexQueryWrapper.java
+++ b/src/main/java/de/ids_mannheim/korap/query/wrap/SpanRegexQueryWrapper.java
@@ -28,11 +28,11 @@
if (re.startsWith("s:")) {
re = re.replaceFirst("s:", "i:");
};
- // TODO: This may break things like \N
re = re.toLowerCase();
};
RegexpQuery requery = new RegexpQuery(new Term(field, re), flags);
query = new SpanMultiTermQueryWrapper<RegexpQuery>( requery );
+
};
public SpanQuery toQuery() {
diff --git a/src/main/java/de/ids_mannheim/korap/query/wrap/SpanWildcardQueryWrapper.java b/src/main/java/de/ids_mannheim/korap/query/wrap/SpanWildcardQueryWrapper.java
new file mode 100644
index 0000000..7289bfa
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/query/wrap/SpanWildcardQueryWrapper.java
@@ -0,0 +1,57 @@
+package de.ids_mannheim.korap.query.wrap;
+
+import org.apache.lucene.search.WildcardQuery;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
+import org.apache.lucene.index.Term;
+
+import java.util.*;
+
+/**
+ * Wraps a Lucene WildcardQuery in a SpanMultiTermQueryWrapper, so it
+ * can be used as a SpanQuery.
+ */
+public class SpanWildcardQueryWrapper {
+    private final SpanQuery query;
+
+    /**
+     * Create a case-sensitive wildcard query.
+     *
+     * @param field The name of the field to search in.
+     * @param wc The wildcard term as a string.
+     */
+    public SpanWildcardQueryWrapper (String field, String wc) {
+        this(field, wc, false);
+    };
+
+    /**
+     * Create a wildcard query.
+     *
+     * @param field The name of the field to search in.
+     * @param wc The wildcard term as a string.
+     * @param caseinsensitive If true, a leading "s:" prefix is replaced
+     *        by "i:" and the term is lowercased.
+     */
+    public SpanWildcardQueryWrapper (String field, String wc, boolean caseinsensitive) {
+        if (caseinsensitive) {
+            if (wc.startsWith("s:")) {
+                wc = "i:" + wc.substring(2);
+            };
+            // Lowercase independent of the default locale of the JVM,
+            // so the term does not change with the platform (e.g. the
+            // Turkish dotless i)
+            wc = wc.toLowerCase(Locale.ROOT);
+        };
+        WildcardQuery wcquery = new WildcardQuery(new Term(field, wc));
+        query = new SpanMultiTermQueryWrapper<WildcardQuery>(wcquery);
+    };
+
+    /**
+     * Get the wrapped query.
+     *
+     * @return The wildcard query as a span query.
+     */
+    public SpanQuery toQuery() {
+        return this.query;
+    };
+};
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestNextIndex.java b/src/test/java/de/ids_mannheim/korap/index/TestNextIndex.java
index 9b9aae7..badc261 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestNextIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestNextIndex.java
@@ -25,8 +25,6 @@
import org.apache.lucene.index.Term;
-// mvn -Dtest=TestWithinIndex#indexExample1 test
-
@RunWith(JUnit4.class)
public class TestNextIndex {
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestRegexWildcardIndex.java b/src/test/java/de/ids_mannheim/korap/index/TestRegexWildcardIndex.java
new file mode 100644
index 0000000..4f2936e
--- /dev/null
+++ b/src/test/java/de/ids_mannheim/korap/index/TestRegexWildcardIndex.java
@@ -0,0 +1,221 @@
+import java.util.*;
+import java.io.*;
+
+import org.apache.lucene.util.Version;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.Bits;
+
+import static org.junit.Assert.*;
+import org.junit.Test;
+import org.junit.Ignore;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+import de.ids_mannheim.korap.KorapIndex;
+import de.ids_mannheim.korap.KorapQuery;
+import de.ids_mannheim.korap.KorapResult;
+import de.ids_mannheim.korap.KorapSearch;
+import de.ids_mannheim.korap.index.FieldDocument;
+import de.ids_mannheim.korap.analysis.MultiTermTokenStream;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.SpanTermQuery;
+
+/**
+ * Tests regular expression and wildcard queries against a small index.
+ */
+@RunWith(JUnit4.class)
+public class TestRegexWildcardIndex {
+
+    /**
+     * Create a document with ten all-lowercase tokens in the
+     * field "base", each with an "s:" term.
+     */
+    private static FieldDocument lowercaseDoc () throws IOException {
+        FieldDocument fd = new FieldDocument();
+        fd.addTV("base",
+                 "affe afffe baum baumgarten steingarten franz hans haus efeu effe",
+                 "[(0-4)s:affe|_0#0-4|-:t$<i>10]" +
+                 "[(5-10)s:afffe|_1#5-10]" +
+                 "[(11-15)s:baum|_2#11-15]" +
+                 "[(16-26)s:baumgarten|_3#16-26]" +
+                 "[(27-38)s:steingarten|_4#27-38]" +
+                 "[(39-44)s:franz|_5#39-44]" +
+                 "[(45-49)s:hans|_6#45-49]" +
+                 "[(50-54)s:haus|_7#50-54]" +
+                 "[(55-59)s:efeu|_8#55-59]" +
+                 "[(60-64)s:effe|_9#60-64]");
+        return fd;
+    };
+
+    /**
+     * Create an index holding only the given document and commit it.
+     */
+    private static KorapIndex indexWith (FieldDocument fd) throws IOException {
+        KorapIndex ki = new KorapIndex();
+        ki.addDoc(fd);
+        ki.commit();
+        return ki;
+    };
+
+    /**
+     * Create a search for the given query with token-based left and
+     * right contexts of length 1, as the expected snippets assume.
+     */
+    private static KorapSearch searchFor (SpanQuery sq) throws IOException {
+        KorapSearch ks = new KorapSearch(sq);
+        ks.leftContext.setToken(true).setLength(1);
+        ks.rightContext.setToken(true).setLength(1);
+        return ks;
+    };
+
+    @Test
+    public void indexRegex () throws IOException {
+        KorapIndex ki = indexWith(lowercaseDoc());
+
+        KorapQuery kq = new KorapQuery("base");
+        SpanQuery sq = kq.re("s:af*e").toQuery();
+        assertEquals("SpanMultiTermQueryWrapper(base:/s:af*e/)", sq.toString());
+
+        KorapSearch ks = searchFor(sq);
+
+        KorapResult kr = ki.search(ks);
+        assertEquals(2, kr.getTotalResults());
+        assertEquals("[affe] afffe ...", kr.getMatch(0).getSnippetBrackets());
+        assertEquals("affe [afffe] baum ...", kr.getMatch(1).getSnippetBrackets());
+
+        kr = ki.search(ks.setQuery(new KorapQuery("base").re("s:baum.*").toQuery()));
+        assertEquals(2, kr.getTotalResults());
+        assertEquals("... afffe [baum] baumgarten ...", kr.getMatch(0).getSnippetBrackets());
+        assertEquals("... baum [baumgarten] steingarten ...", kr.getMatch(1).getSnippetBrackets());
+
+        kr = ki.search(ks.setQuery(new KorapQuery("base").re("s:.....?garten").toQuery()));
+        assertEquals(2, kr.getTotalResults());
+        assertEquals("... baum [baumgarten] steingarten ...", kr.getMatch(0).getSnippetBrackets());
+        assertEquals("... baumgarten [steingarten] franz ...", kr.getMatch(1).getSnippetBrackets());
+
+        kr = ki.search(ks.setQuery(new KorapQuery("base").re("s:ha.s").toQuery()));
+        assertEquals(2, kr.getTotalResults());
+        assertEquals("... franz [hans] haus ...", kr.getMatch(0).getSnippetBrackets());
+        assertEquals("... hans [haus] efeu ...", kr.getMatch(1).getSnippetBrackets());
+
+        kr = ki.search(ks.setQuery(new KorapQuery("base").re("s:.*ff.*").toQuery()));
+        assertEquals(3, kr.getTotalResults());
+        assertEquals("[affe] afffe ...", kr.getMatch(0).getSnippetBrackets());
+        assertEquals("affe [afffe] baum ...", kr.getMatch(1).getSnippetBrackets());
+        assertEquals("... efeu [effe]", kr.getMatch(2).getSnippetBrackets());
+    };
+
+    @Test
+    public void indexWildcard () throws IOException {
+        KorapIndex ki = indexWith(lowercaseDoc());
+
+        KorapQuery kq = new KorapQuery("base");
+        SpanQuery sq = kq.wc("s:af*e").toQuery();
+        assertEquals("SpanMultiTermQueryWrapper(base:s:af*e)", sq.toString());
+
+        KorapSearch ks = searchFor(sq);
+
+        KorapResult kr = ki.search(ks);
+        assertEquals(2, kr.getTotalResults());
+        assertEquals("[affe] afffe ...", kr.getMatch(0).getSnippetBrackets());
+        assertEquals("affe [afffe] baum ...", kr.getMatch(1).getSnippetBrackets());
+
+        kr = ki.search(ks.setQuery(new KorapQuery("base").wc("s:baum.*").toQuery()));
+        assertEquals(0, kr.getTotalResults());
+
+        kr = ki.search(ks.setQuery(new KorapQuery("base").wc("s:baum*").toQuery()));
+        assertEquals(2, kr.getTotalResults());
+        assertEquals("... afffe [baum] baumgarten ...", kr.getMatch(0).getSnippetBrackets());
+        assertEquals("... baum [baumgarten] steingarten ...", kr.getMatch(1).getSnippetBrackets());
+
+        kr = ki.search(ks.setQuery(new KorapQuery("base").wc("s:*garten").toQuery()));
+        assertEquals(2, kr.getTotalResults());
+        assertEquals("... baum [baumgarten] steingarten ...", kr.getMatch(0).getSnippetBrackets());
+        assertEquals("... baumgarten [steingarten] franz ...", kr.getMatch(1).getSnippetBrackets());
+
+        kr = ki.search(ks.setQuery(new KorapQuery("base").wc("s:ha?s").toQuery()));
+        assertEquals(2, kr.getTotalResults());
+        assertEquals("... franz [hans] haus ...", kr.getMatch(0).getSnippetBrackets());
+        assertEquals("... hans [haus] efeu ...", kr.getMatch(1).getSnippetBrackets());
+
+        kr = ki.search(ks.setQuery(new KorapQuery("base").wc("s:?ff?").toQuery()));
+        assertEquals(2, kr.getTotalResults());
+        assertEquals("[affe] afffe ...", kr.getMatch(0).getSnippetBrackets());
+        assertEquals("... efeu [effe]", kr.getMatch(1).getSnippetBrackets());
+    };
+
+    @Test
+    public void indexRegexCaseInsensitive () throws IOException {
+        // Mixed-case "s:" terms, each paired with a lowercased "i:" term
+        FieldDocument fd = new FieldDocument();
+        fd.addTV("base",
+                 "AfFe aFfFE Baum Baumgarten SteinGarten franZ HaNs Haus Efeu effe",
+                 "[(0-4)s:AfFe|i:affe|_0#0-4|-:t$<i>10]" +
+                 "[(5-10)s:aFfFE|i:afffe|_1#5-10]" +
+                 "[(11-15)s:Baum|i:baum|_2#11-15]" +
+                 "[(16-26)s:Baumgarten|i:baumgarten|_3#16-26]" +
+                 "[(27-38)s:SteinGarten|i:steingarten|_4#27-38]" +
+                 "[(39-44)s:franZ|i:franz|_5#39-44]" +
+                 "[(45-49)s:HaNs|i:hans|_6#45-49]" +
+                 "[(50-54)s:Haus|i:haus|_7#50-54]" +
+                 "[(55-59)s:Efeu|i:efeu|_8#55-59]" +
+                 "[(60-64)s:effe|i:effe|_9#60-64]");
+        KorapIndex ki = indexWith(fd);
+
+        KorapQuery kq = new KorapQuery("base");
+        SpanQuery sq = kq.re("s:Af*e", true).toQuery();
+        assertEquals("SpanMultiTermQueryWrapper(base:/i:af*e/)", sq.toString());
+
+        KorapSearch ks = searchFor(sq);
+
+        KorapResult kr = ki.search(ks);
+        assertEquals(2, kr.getTotalResults());
+        assertEquals("[AfFe] aFfFE ...", kr.getMatch(0).getSnippetBrackets());
+        assertEquals("AfFe [aFfFE] Baum ...", kr.getMatch(1).getSnippetBrackets());
+
+        kr = ki.search(ks.setQuery(new KorapQuery("base").re("s:Af.*e").toQuery()));
+        assertEquals(1, kr.getTotalResults());
+        assertEquals("[AfFe] aFfFE ...", kr.getMatch(0).getSnippetBrackets());
+
+        kr = ki.search(ks.setQuery(new KorapQuery("base").re("s:baum.*", true).toQuery()));
+        assertEquals(2, kr.getTotalResults());
+        assertEquals("... aFfFE [Baum] Baumgarten ...", kr.getMatch(0).getSnippetBrackets());
+        assertEquals("... Baum [Baumgarten] SteinGarten ...", kr.getMatch(1).getSnippetBrackets());
+
+        kr = ki.search(ks.setQuery(new KorapQuery("base").re("s:.*garten", true).toQuery()));
+        assertEquals(2, kr.getTotalResults());
+        assertEquals("... Baum [Baumgarten] SteinGarten ...", kr.getMatch(0).getSnippetBrackets());
+        assertEquals("... Baumgarten [SteinGarten] franZ ...", kr.getMatch(1).getSnippetBrackets());
+
+        kr = ki.search(ks.setQuery(new KorapQuery("base").re("s:.*garten", false).toQuery()));
+        assertEquals(1, kr.getTotalResults());
+        assertEquals("... Baum [Baumgarten] SteinGarten ...", kr.getMatch(0).getSnippetBrackets());
+
+        kr = ki.search(ks.setQuery(new KorapQuery("base").re("s:ha.s", true).toQuery()));
+        assertEquals(2, kr.getTotalResults());
+        assertEquals("... franZ [HaNs] Haus ...", kr.getMatch(0).getSnippetBrackets());
+        assertEquals("... HaNs [Haus] Efeu ...", kr.getMatch(1).getSnippetBrackets());
+
+        kr = ki.search(ks.setQuery(new KorapQuery("base").re("s:.*f*e", true).toQuery()));
+        assertEquals(3, kr.getTotalResults());
+        assertEquals("[AfFe] aFfFE ...", kr.getMatch(0).getSnippetBrackets());
+        assertEquals("AfFe [aFfFE] Baum ...", kr.getMatch(1).getSnippetBrackets());
+        assertEquals("... Efeu [effe]", kr.getMatch(2).getSnippetBrackets());
+    };
+
+    @Test
+    public void indexRegexCombined () throws IOException {
+        KorapIndex ki = indexWith(lowercaseDoc());
+
+        KorapQuery kq = new KorapQuery("base");
+        SpanQuery sq = kq.seq(kq.seg("s:affe")).append(kq.re("s:af*e")).toQuery();
+        assertEquals("spanNext(base:s:affe, SpanMultiTermQueryWrapper(base:/s:af*e/))", sq.toString());
+
+        KorapSearch ks = searchFor(sq);
+
+        KorapResult kr = ki.search(ks);
+        assertEquals(1, kr.getTotalResults());
+        assertEquals("[affe afffe] baum ...", kr.getMatch(0).getSnippetBrackets());
+    };
+};