Index text fields as text fields and query them with phrase queries
Change-Id: Ifdc4db4185f191f9e79065351816750347e1f7ac
diff --git a/src/main/java/de/ids_mannheim/korap/KrillCollection.java b/src/main/java/de/ids_mannheim/korap/KrillCollection.java
index 5d405eb..2afe0aa 100644
--- a/src/main/java/de/ids_mannheim/korap/KrillCollection.java
+++ b/src/main/java/de/ids_mannheim/korap/KrillCollection.java
@@ -227,19 +227,17 @@
return this.cb.term(key, json.get("value").asText())
.not();
- // TODO:
- // This needs to change - but for now it means the elements are lowercased
case "match:contains":
- return this.cb.term(key,
+ return this.cb.text(key,
json.get("value").asText().toLowerCase());
case "match:containsnot":
- return this.cb.term(key,
+ return this.cb.text(key,
json.get("value").asText().toLowerCase()).not();
// <LEGACY>
case "match:excludes":
- return this.cb.term(key,
+ return this.cb.text(key,
json.get("value").asText().toLowerCase()).not();
// </LEGACY>
};
diff --git a/src/main/java/de/ids_mannheim/korap/collection/CollectionBuilder.java b/src/main/java/de/ids_mannheim/korap/collection/CollectionBuilder.java
index b688aea..7dd3816 100644
--- a/src/main/java/de/ids_mannheim/korap/collection/CollectionBuilder.java
+++ b/src/main/java/de/ids_mannheim/korap/collection/CollectionBuilder.java
@@ -8,11 +8,11 @@
import org.apache.lucene.queries.TermsFilter;
import org.apache.lucene.search.*;
import org.apache.lucene.search.NumericRangeFilter;
-import org.apache.lucene.analysis.de.GermanAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import de.ids_mannheim.korap.util.KrillDate;
+import de.ids_mannheim.korap.index.TextAnalyzer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -216,7 +216,7 @@
// does too mzch, I guess.
public Filter toFilter () {
StringReader reader = new StringReader(this.text);
- GermanAnalyzer ga = new GermanAnalyzer();
+ TextAnalyzer ga = new TextAnalyzer();
PhraseQuery pq = new PhraseQuery();
int pos = 0;
try {
diff --git a/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java b/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java
index b7cf517..54dc6bb 100644
--- a/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java
+++ b/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java
@@ -20,7 +20,11 @@
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.analysis.TokenStream;
+
import java.util.*;
+import java.io.StringReader;
+import java.io.IOException;
/*
TODO: Store primary data at base/cons field.
@@ -50,6 +54,9 @@
public class FieldDocument extends AbstractDocument {
ObjectMapper mapper = new ObjectMapper();
+ @JsonIgnore
+ private TextAnalyzer analyzer = new TextAnalyzer();
+
@JsonIgnore
public Document doc = new Document();
private FieldType tvField = new FieldType(TextField.TYPE_STORED);
@@ -92,7 +99,14 @@
public void addText (String key, String value) {
- doc.add(new TextField(key, value, Field.Store.YES));
+ Field textField = new Field(key, value, tvField);
+ try {
+ textField.tokenStream(this.analyzer, null);
+ doc.add(textField);
+ }
+ catch (IOException io) {
+ System.err.println(io);
+ };
};
diff --git a/src/main/java/de/ids_mannheim/korap/index/TextAnalyzer.java b/src/main/java/de/ids_mannheim/korap/index/TextAnalyzer.java
index 205b201..dd6abb2 100644
--- a/src/main/java/de/ids_mannheim/korap/index/TextAnalyzer.java
+++ b/src/main/java/de/ids_mannheim/korap/index/TextAnalyzer.java
@@ -8,6 +8,14 @@
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
import java.io.Reader;
+/*
+ * TODO:
+ * Prepend each term with a special marker like '~' and
+ * prepend the token stream with a verbatim representation.
+ * That way it is possible to search by term verbatim and by text
+ * with a phrase query!
+ */
+
public class TextAnalyzer extends Analyzer {
@Override
diff --git a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java
index 4b5ae02..fe796fc 100644
--- a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java
@@ -304,7 +304,6 @@
ki = new KrillIndex();
FieldDocument fd = ki.addDoc(createDoc1());
ki.commit();
-
Analyzer ana = new TextAnalyzer();
TokenStream ts = fd.doc.getField("text").tokenStream(ana, null);
@@ -409,7 +408,10 @@
kcn.fromBuilder(cb.re("author", "Frank|Peter"));
assertEquals(2, kcn.docCount());
- // "Frau" doesn't work!
+ // "Frau" is not matched by a term query - it requires a text query!
+ kcn.fromBuilder(cb.text("text", "Frau"));
+ assertEquals(1, kcn.docCount());
+
kcn.fromBuilder(cb.term("text", "frau"));
assertEquals(1, kcn.docCount());
@@ -429,12 +431,18 @@
CollectionBuilder cb = new CollectionBuilder();
KrillCollection kcn = new KrillCollection(ki);
+ kcn.fromBuilder(cb.term("text", "mann"));
+ assertEquals(1, kcn.docCount());
+
+ kcn.fromBuilder(cb.text("text", "Mann"));
+ assertEquals(1, kcn.docCount());
+
// Simple string tests
kcn.fromBuilder(cb.text("text", "Der alte Mann"));
- // Uses german analyzer for the moment
- assertEquals(kcn.toString(), "QueryWrapperFilter(text:\"alt mann\")");
- // assertEquals(3, kcn.docCount());
+ // Uses TextAnalyzer now (lowercased; no stemming or stopword removal)
+ assertEquals(kcn.toString(), "QueryWrapperFilter(text:\"der alte mann\")");
+ assertEquals(1, kcn.docCount());
};
diff --git a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionJSON.java b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionJSON.java
index 95f8073..ad69ffc 100644
--- a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionJSON.java
+++ b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionJSON.java
@@ -101,7 +101,8 @@
assertFalse(ks.hasErrors());
assertFalse(ks.hasWarnings());
assertFalse(ks.hasMessages());
- assertEquals("-author:goethe", ks.getCollection().toString());
+ assertEquals("-QueryWrapperFilter(author:\"goethe\")",
+ ks.getCollection().toString());
};