Index text fields as text fields and query them with phrase queries

Change-Id: Ifdc4db4185f191f9e79065351816750347e1f7ac
diff --git a/src/main/java/de/ids_mannheim/korap/KrillCollection.java b/src/main/java/de/ids_mannheim/korap/KrillCollection.java
index 5d405eb..2afe0aa 100644
--- a/src/main/java/de/ids_mannheim/korap/KrillCollection.java
+++ b/src/main/java/de/ids_mannheim/korap/KrillCollection.java
@@ -227,19 +227,17 @@
                         return this.cb.term(key, json.get("value").asText())
                                 .not();
 
-					// TODO:
-					// This needs to change - but for now it means the elements are lowercased
                     case "match:contains":
-                        return this.cb.term(key,
+                        return this.cb.text(key,
                                 json.get("value").asText().toLowerCase());
 
                     case "match:containsnot":
-                        return this.cb.term(key,
+                        return this.cb.text(key,
                                 json.get("value").asText().toLowerCase()).not();
 
                     // <LEGACY>
                     case "match:excludes":
-                        return this.cb.term(key,
+                        return this.cb.text(key,
                                 json.get("value").asText().toLowerCase()).not();
                     // </LEGACY>
                 };
diff --git a/src/main/java/de/ids_mannheim/korap/collection/CollectionBuilder.java b/src/main/java/de/ids_mannheim/korap/collection/CollectionBuilder.java
index b688aea..7dd3816 100644
--- a/src/main/java/de/ids_mannheim/korap/collection/CollectionBuilder.java
+++ b/src/main/java/de/ids_mannheim/korap/collection/CollectionBuilder.java
@@ -8,11 +8,11 @@
 import org.apache.lucene.queries.TermsFilter;
 import org.apache.lucene.search.*;
 import org.apache.lucene.search.NumericRangeFilter;
-import org.apache.lucene.analysis.de.GermanAnalyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 
 import de.ids_mannheim.korap.util.KrillDate;
+import de.ids_mannheim.korap.index.TextAnalyzer;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -216,7 +216,7 @@
 		//    does too mzch, I guess.
         public Filter toFilter () {
 			StringReader reader = new StringReader(this.text);
-			GermanAnalyzer ga = new GermanAnalyzer();
+			TextAnalyzer ga = new TextAnalyzer();
 			PhraseQuery pq = new PhraseQuery();
 			int pos = 0;
 			try {
diff --git a/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java b/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java
index b7cf517..54dc6bb 100644
--- a/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java
+++ b/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java
@@ -20,7 +20,11 @@
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.index.IndexOptions;
 
+import org.apache.lucene.analysis.TokenStream;
+
 import java.util.*;
+import java.io.StringReader;
+import java.io.IOException;
 
 /*
   TODO: Store primary data at base/cons field.
@@ -50,6 +54,9 @@
 public class FieldDocument extends AbstractDocument {
     ObjectMapper mapper = new ObjectMapper();
 
+	@JsonIgnore
+	private TextAnalyzer analyzer = new TextAnalyzer();
+	
     @JsonIgnore
     public Document doc = new Document();
     private FieldType tvField = new FieldType(TextField.TYPE_STORED);
@@ -92,7 +99,14 @@
 
 
     public void addText (String key, String value) {
-        doc.add(new TextField(key, value, Field.Store.YES));
+		Field textField = new Field(key, value, tvField); // field is now built with tvField instead of the former plain TextField
+		try {
+			textField.tokenStream(this.analyzer, null); // NOTE(review): the returned TokenStream is discarded - confirm this call is needed
+			doc.add(textField); // skipped when tokenStream() throws
+		}
+		catch (IOException io) {
+			System.err.println(io); // failure is only printed; the field is then missing from doc
+		};
     };
 
 
diff --git a/src/main/java/de/ids_mannheim/korap/index/TextAnalyzer.java b/src/main/java/de/ids_mannheim/korap/index/TextAnalyzer.java
index 205b201..dd6abb2 100644
--- a/src/main/java/de/ids_mannheim/korap/index/TextAnalyzer.java
+++ b/src/main/java/de/ids_mannheim/korap/index/TextAnalyzer.java
@@ -8,6 +8,14 @@
 import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
 import java.io.Reader;
 
+/*
+ * TODO:
+ *   Prefix each analyzed term with a special marker like '~' and
+ *   prepend the token stream with a verbatim representation.
+ *   That way it's possible to search verbatim with a term query
+ *   and by text with a phrase query!
+ */
+
 public class TextAnalyzer extends Analyzer {
 
     @Override
diff --git a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java
index 4b5ae02..fe796fc 100644
--- a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java
@@ -304,7 +304,6 @@
         ki = new KrillIndex();
         FieldDocument fd = ki.addDoc(createDoc1());
         ki.commit();
-
         Analyzer ana = new TextAnalyzer();
         TokenStream ts = fd.doc.getField("text").tokenStream(ana, null);
 
@@ -409,7 +408,10 @@
         kcn.fromBuilder(cb.re("author", "Frank|Peter"));
         assertEquals(2, kcn.docCount());
 
-        // "Frau" doesn't work!
+		// "Frau" (capitalized) requires a text query; the term query below uses the lowercased form
+		kcn.fromBuilder(cb.text("text", "Frau"));
+        assertEquals(1, kcn.docCount());
+
         kcn.fromBuilder(cb.term("text", "frau"));
         assertEquals(1, kcn.docCount());
 
@@ -429,12 +431,18 @@
         CollectionBuilder cb = new CollectionBuilder();
         KrillCollection kcn = new KrillCollection(ki);
 
+        kcn.fromBuilder(cb.term("text", "mann"));
+        assertEquals(1, kcn.docCount());
+
+		kcn.fromBuilder(cb.text("text", "Mann"));
+        assertEquals(1, kcn.docCount());
+
 		// Simple string tests
         kcn.fromBuilder(cb.text("text", "Der alte Mann"));
 
-		// Uses german analyzer for the moment
-		assertEquals(kcn.toString(), "QueryWrapperFilter(text:\"alt mann\")");
-        // assertEquals(3, kcn.docCount());
+		// Analyzed with TextAnalyzer: the phrase is lowercased, but not stemmed, and "der" is kept
+		assertEquals(kcn.toString(), "QueryWrapperFilter(text:\"der alte mann\")");
+		assertEquals(1, kcn.docCount());
 	};
 
 
diff --git a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionJSON.java b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionJSON.java
index 95f8073..ad69ffc 100644
--- a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionJSON.java
+++ b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionJSON.java
@@ -101,7 +101,8 @@
         assertFalse(ks.hasErrors());
         assertFalse(ks.hasWarnings());
         assertFalse(ks.hasMessages());
-        assertEquals("-author:goethe", ks.getCollection().toString());
+        assertEquals("-QueryWrapperFilter(author:\"goethe\")",
+					 ks.getCollection().toString());
     };