Added text search functionality to collection builder
Change-Id: I16d45bb7651763e6f41857c0839962bd14a1f0af
diff --git a/Changes b/Changes
index 1ed8d2e..7ed178a 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,6 @@
+0.57 2018-03-28
+ - [feature] Support text queries in metadata (diewald)
+
0.56.2 2018-03-23
- [feature] Introduce meta field retrieval method (diewald)
- [cleanup] Rename KrillQuery's "_"-method to "nr" to improve
diff --git a/pom.xml b/pom.xml
index cb67ac4..1094693 100644
--- a/pom.xml
+++ b/pom.xml
@@ -35,7 +35,7 @@
<groupId>de.ids_mannheim.korap</groupId>
<artifactId>Krill</artifactId>
- <version>0.56.2</version>
+ <version>0.57.0</version>
<packaging>jar</packaging>
<name>Krill</name>
diff --git a/src/main/java/de/ids_mannheim/korap/KrillCollection.java b/src/main/java/de/ids_mannheim/korap/KrillCollection.java
index b2d326e..5d405eb 100644
--- a/src/main/java/de/ids_mannheim/korap/KrillCollection.java
+++ b/src/main/java/de/ids_mannheim/korap/KrillCollection.java
@@ -227,7 +227,8 @@
return this.cb.term(key, json.get("value").asText())
.not();
- // This may change - but for now it means the elements are lowercased
+ // TODO:
+ // This needs to change - but for now it means the elements are lowercased
case "match:contains":
return this.cb.term(key,
json.get("value").asText().toLowerCase());
diff --git a/src/main/java/de/ids_mannheim/korap/collection/CollectionBuilder.java b/src/main/java/de/ids_mannheim/korap/collection/CollectionBuilder.java
index 4716804..b688aea 100644
--- a/src/main/java/de/ids_mannheim/korap/collection/CollectionBuilder.java
+++ b/src/main/java/de/ids_mannheim/korap/collection/CollectionBuilder.java
@@ -2,11 +2,16 @@
import java.util.*;
import java.io.IOException;
+import java.io.StringReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.TermsFilter;
import org.apache.lucene.search.*;
import org.apache.lucene.search.NumericRangeFilter;
+import org.apache.lucene.analysis.de.GermanAnalyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
import de.ids_mannheim.korap.util.KrillDate;
import org.slf4j.Logger;
@@ -17,9 +22,12 @@
/*
* TODO: Optimize!
- * - Remove identical object in Boolean groups
- * - Flatten boolean groups
- * - create "between" ranges for multiple date objects
+ * - Remove identical object in Boolean groups
+ * - Flatten boolean groups
+ * - create "between" ranges for multiple date objects
+ *
+ * TODO:
+ * - Filters are deprecated, they should be ported to queries
*/
public class CollectionBuilder {
@@ -42,6 +50,11 @@
};
+ /**
+  * Create a text constraint on a field.
+  *
+  * @param field the name of the field to constrain
+  * @param text the text to look for in that field
+  * @return a new {@link CollectionBuilder.Text} built from both arguments
+  */
+ public CollectionBuilder.Interface text (String field, String text) {
+     CollectionBuilder.Interface constraint = new CollectionBuilder.Text(field, text);
+     return constraint;
+ };
+
+
public CollectionBuilder.Interface since (String field, String date) {
int since = new KrillDate(date).floor();
@@ -185,6 +198,66 @@
};
};
+
+ /**
+  * Collection constraint that matches a phrase in a text field.
+  *
+  * The given text is run through a {@link GermanAnalyzer} and the
+  * resulting tokens are combined into a {@link PhraseQuery} on the field.
+  */
+ public class Text implements CollectionBuilder.Interface {
+     private boolean isNegative = false;
+     // private boolean regex = false;
+     private final String field;
+     private final String text;
+
+
+     /**
+      * @param field the name of the field to search in
+      * @param text the raw, not yet analyzed text to search for
+      */
+     public Text (String field, String text) {
+         this.field = field;
+         this.text = text;
+     };
+
+     // TODO:
+     //   Currently this treatment is language specific and
+     //   does too much, I guess.
+     /**
+      * Build a filter wrapping a phrase query over the analyzed text.
+      *
+      * Tokens are added at consecutive positions starting at 0, in the
+      * order the analyzer emits them.
+      *
+      * @return the phrase query wrapped in a {@link QueryWrapperFilter},
+      *         or <tt>null</tt> if the analysis failed with an
+      *         {@link IOException}
+      */
+     public Filter toFilter () {
+         PhraseQuery pq = new PhraseQuery();
+         int pos = 0;
+
+         // try-with-resources releases reader, analyzer and token
+         // stream on every path, including analysis failures
+         try (StringReader reader = new StringReader(this.text);
+              GermanAnalyzer ga = new GermanAnalyzer();
+              TokenStream ts = ga.tokenStream(this.field, reader)) {
+
+             // The attribute instance is reused for every token,
+             // so it only needs to be fetched once
+             CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
+
+             ts.reset();
+             while (ts.incrementToken()) {
+                 // Fully qualified on purpose, as in the original code
+                 pq.add(
+                     new org.apache.lucene.index.Term(this.field, term.toString()),
+                     pos++
+                 );
+             }
+
+             // Required by the TokenStream consumer workflow before close()
+             ts.end();
+         }
+         catch (IOException ie) {
+             System.err.println(ie);
+             return null;
+         }
+
+         return new QueryWrapperFilter(pq);
+     };
+
+
+     /**
+      * @return the string form of the filter, or the empty string
+      *         if no filter could be built
+      */
+     public String toString () {
+         Filter filter = this.toFilter();
+         if (filter == null)
+             return "";
+         return filter.toString();
+     };
+
+
+     /**
+      * @return <tt>true</tt> if {@link #not()} was called on this constraint
+      */
+     public boolean isNegative () {
+         return this.isNegative;
+     };
+
+
+     /**
+      * Mark this constraint as negative.
+      *
+      * @return this constraint, for chaining
+      */
+     public CollectionBuilder.Interface not () {
+         this.isNegative = true;
+         return this;
+     };
+ };
+
+
+
+
public class Group implements CollectionBuilder.Interface {
private boolean isOptional = false;
private boolean isNegative = true;
diff --git a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java
index 8108b30..4b5ae02 100644
--- a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java
@@ -420,6 +420,23 @@
assertEquals(3, kcn.docCount());
};
+ /**
+  * A text constraint is serialized as a phrase query over the
+  * analyzed input.
+  */
+ @Test
+ public void testIndexWithTextStringQueries () throws IOException {
+     ki = new KrillIndex();
+     ki.addDoc(createDoc1());
+     ki.commit();
+
+     CollectionBuilder cb = new CollectionBuilder();
+     KrillCollection kcn = new KrillCollection(ki);
+
+     // Simple string tests
+     kcn.fromBuilder(cb.text("text", "Der alte Mann"));
+
+     // Uses German analyzer for the moment.
+     // Expected value goes first, so a failure reports the values correctly
+     assertEquals("QueryWrapperFilter(text:\"alt mann\")", kcn.toString());
+     // assertEquals(3, kcn.docCount());
+ };
+
+
@Test
public void filterExampleFromLegacy () throws Exception {