Fixed case sensitivity bug in filters
Change-Id: I702734c9884a17f09ddfb4f9677b85f0eea2681b
diff --git a/Changes b/Changes
index 9a7ca91..1bf1d78 100644
--- a/Changes
+++ b/Changes
@@ -25,6 +25,7 @@
parsing failures (diewald)
- [feature] Support '@all' as a 'fields' value for all meta
data fields (diewald)
+ - [bugfix] Fix case sensitivity bug in filters (diewald)
0.51 2015-03-17
- This is a major version (prepared for the GitHub release)
diff --git a/pom.xml b/pom.xml
index 3c6da01..e6d8a61 100644
--- a/pom.xml
+++ b/pom.xml
@@ -212,6 +212,12 @@
</dependency>
-->
+ <!-- Some language extensions like StringUtils -->
+ <dependency>
+ <groupId>commons-lang</groupId>
+ <artifactId>commons-lang</artifactId>
+ <version>2.3</version>
+ </dependency>
</dependencies>
<build>
diff --git a/src/main/java/de/ids_mannheim/korap/KrillCollection.java b/src/main/java/de/ids_mannheim/korap/KrillCollection.java
index 6fe28c9..83e40fa 100644
--- a/src/main/java/de/ids_mannheim/korap/KrillCollection.java
+++ b/src/main/java/de/ids_mannheim/korap/KrillCollection.java
@@ -67,7 +67,7 @@
.getLogger(KrillCollection.class);
// This advices the java compiler to ignore all loggings
- public static final boolean DEBUG = false;
+ public static final boolean DEBUG = false; // keep disabled in committed code (see comment above); enable only locally
/**
@@ -235,8 +235,14 @@
else if (match.equals("match:ne")) {
bfilter.andNot(key, json.get("value").asText());
}
+ // May change: values are lowercased like the indexed terms; Locale.ROOT keeps this independent of the JVM default locale
+ else if (match.equals("match:contains")) {
+ bfilter.and(key, json.get("value").asText().toLowerCase(java.util.Locale.ROOT));
+ }
+ else if (match.equals("match:excludes")) {
+ bfilter.andNot(key, json.get("value").asText().toLowerCase(java.util.Locale.ROOT));
+ }
else {
- // TODO!
throw new QueryException(0, "Unknown match type");
};
diff --git a/src/main/java/de/ids_mannheim/korap/KrillIndex.java b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
index 7d27c17..02597d1 100644
--- a/src/main/java/de/ids_mannheim/korap/KrillIndex.java
+++ b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
@@ -15,9 +15,12 @@
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.*;
import org.apache.lucene.analysis.Analyzer;
+/*
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
+*/
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
+import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.*;
import org.apache.lucene.util.automaton.*;
@@ -209,14 +212,14 @@
this.directory = directory;
// Add analyzers
- // TODO: Should probably not be here
+ // TODO: Should probably not be here - make configurable
Map<String, Analyzer> analyzerPerField = new HashMap<String, Analyzer>();
- analyzerPerField.put("textClass", new WhitespaceAnalyzer(
- Version.LUCENE_CURRENT));
- analyzerPerField.put("foundries", new WhitespaceAnalyzer(
- Version.LUCENE_CURRENT));
+ analyzerPerField.put("textClass", new KeywordAnalyzer());
+ analyzerPerField.put("keywords", new KeywordAnalyzer());
+ analyzerPerField.put("foundries", new KeywordAnalyzer());
PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(
- new StandardAnalyzer(Version.LUCENE_CURRENT), analyzerPerField);
+ new TextAnalyzer(), analyzerPerField
+ );
// Create configuration with base analyzer
this.config = new IndexWriterConfig(Version.LUCENE_CURRENT, analyzer);
@@ -1469,6 +1472,18 @@
return kr;
};
+ public void getFields () {
+ /*
+ * TODO: Not implemented yet (empty body, nothing is returned).
+ * Intended to return a map of key, value pairs, e.g.:
+ * keywords => keywords (contains)
+ * author => text (contains)
+ */
+ };
+
+ public void getValues (String field) {
+ // TODO: Not implemented yet - the body is empty and the field argument is ignored
+ };
// Collect matches
public MatchCollector collect (Krill ks, MatchCollector mc) {
diff --git a/src/main/java/de/ids_mannheim/korap/collection/RegexFilter.java b/src/main/java/de/ids_mannheim/korap/collection/RegexFilter.java
index f9d881c..c948dab 100644
--- a/src/main/java/de/ids_mannheim/korap/collection/RegexFilter.java
+++ b/src/main/java/de/ids_mannheim/korap/collection/RegexFilter.java
@@ -9,7 +9,7 @@
* @author Nils Diewald
*
* RegexFilter implements a helper object for
- * regular expressions used in KorapFilter
+ * regular expressions used in KrillCollection
* constraints.
*/
diff --git a/src/main/java/de/ids_mannheim/korap/index/KeywordAnalyzer.java b/src/main/java/de/ids_mannheim/korap/index/KeywordAnalyzer.java
new file mode 100644
index 0000000..e20af6f
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/index/KeywordAnalyzer.java
@@ -0,0 +1,19 @@
+package de.ids_mannheim.korap.index;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
+import java.io.Reader;
+/** Analyzer for keyword-like fields: splits the input at whitespace and lowercases every token. */
+public class KeywordAnalyzer extends Analyzer {
+    /** Builds the chain WhitespaceTokenizer -> LowerCaseFilter; fieldName is not consulted. */
+    @Override
+    protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
+        final Tokenizer tokenizer = new WhitespaceTokenizer(reader);
+        final TokenStream lowercased = new LowerCaseFilter(tokenizer);
+        return new TokenStreamComponents(tokenizer, lowercased);
+    };
+};
diff --git a/src/main/java/de/ids_mannheim/korap/index/TextAnalyzer.java b/src/main/java/de/ids_mannheim/korap/index/TextAnalyzer.java
new file mode 100644
index 0000000..bd5181b
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/index/TextAnalyzer.java
@@ -0,0 +1,19 @@
+package de.ids_mannheim.korap.index;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
+import java.io.Reader;
+/** General-purpose text analyzer: tokenizes with Lucene's StandardTokenizer and lowercases every token. */
+public class TextAnalyzer extends Analyzer {
+    /** Builds the chain StandardTokenizer -> LowerCaseFilter; fieldName is not consulted. */
+    @Override
+    protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
+        final Tokenizer tokenizer = new StandardTokenizer(reader);
+        final TokenStream lowercased = new LowerCaseFilter(tokenizer);
+        return new TokenStreamComponents(tokenizer, lowercased);
+    };
+};
diff --git a/src/test/java/de/ids_mannheim/korap/search/TestMetaFields.java b/src/test/java/de/ids_mannheim/korap/search/TestMetaFields.java
index 618218f..d0862a4 100644
--- a/src/test/java/de/ids_mannheim/korap/search/TestMetaFields.java
+++ b/src/test/java/de/ids_mannheim/korap/search/TestMetaFields.java
@@ -17,6 +17,11 @@
import java.nio.charset.StandardCharsets;
import java.nio.ByteBuffer;
+import org.apache.commons.lang.StringUtils;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.JsonNode;
@@ -210,6 +215,96 @@
.asText());
};
+ @Test
+ public void searchCollectionFields () throws IOException { // Collection filters on meta fields must match regardless of case
+ KrillIndex ki = new KrillIndex();
+ FieldDocument fd = new FieldDocument();
+ fd.addString("corpusSigle", "ABC");
+ fd.addString("docSigle", "ABC-123");
+ fd.addString("textSigle", "ABC-123-0001");
+ fd.addText("title", "Die Wahlverwandschaften");
+ fd.addText("author", "Johann Wolfgang von Goethe");
+ fd.addKeyword("textClass", "reisen wissenschaft");
+ fd.addInt("pubDate", 20130617);
+ fd.addTV("tokens", "abc", "[(0-1)s:a|i:a|_0#0-1|-:t$<i>10]"
+ + "[(1-2)s:b|i:b|_1#1-2]" + "[(2-3)s:c|i:c|_2#2-3]");
+ ki.addDoc(fd);
+
+ FieldDocument fd2 = new FieldDocument();
+ fd2.addString("corpusSigle", "ABC");
+ fd2.addString("docSigle", "ABC-125");
+ fd2.addString("textSigle", "ABC-125-0001");
+ fd2.addText("title", "Die Glocke");
+ fd2.addText("author", "Schiller, Friedrich");
+ fd2.addKeyword("textClass", "Reisen geschichte");
+ fd2.addInt("pubDate", 20130203);
+ fd2.addTV("tokens", "abc", "[(0-1)s:a|i:a|_0#0-1|-:t$<i>10]"
+ + "[(1-2)s:b|i:b|_1#1-2]" + "[(2-3)s:c|i:c|_2#2-3]");
+ ki.addDoc(fd2);
+ ki.commit();
+
+ // textClass = reisen & wissenschaft (match:eq, and-group): only the first document carries both
+ String jsonString = getString(getClass().getResource(
+ "/queries/collections/collection_textClass.jsonld").getFile());
+ Krill ks = new Krill(jsonString);
+ KrillCollection kc = ks.getCollection();
+ kc.setIndex(ki);
+ assertEquals(1, kc.getCount()); // 1 filter operation
+ assertEquals(1, kc.numberOf("documents"));
+
+ // textClass = reisen (match:contains): must also hit the capitalized "Reisen" of the second document
+ jsonString = getString(getClass().getResource(
+ "/queries/collections/collection_textClass_2.jsonld").getFile());
+ ks = new Krill(jsonString);
+ kc = ks.getCollection();
+ kc.setIndex(ki);
+ assertEquals(1, kc.getCount()); // 1 filter operation
+ assertEquals(2, kc.numberOf("documents"));
+
+ /* Debug aid (disabled): print the textClass and author field values of fd2
+ System.err.println(StringUtils.join(fd2.doc.getValues("textClass"), ","));
+ System.err.println(StringUtils.join(fd2.doc.getValues("author"), ", "));
+ */
+ /* Debug aid (disabled): print the tokens the index writer's analyzer emits for fd2's author field
+ TokenStream ts = fd2.doc.getField("author").tokenStream(
+ (Analyzer) ki.writer().getAnalyzer(),
+ (TokenStream) null
+ );
+ // OffsetAttribute offsetAttribute = ts.addAttribute(OffsetAttribute.class);
+ CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
+
+ ts.reset();
+ while (ts.incrementToken()) {
+ String term = charTermAttribute.toString();
+ System.err.println(">>" + term + "<<");
+ };
+ */
+
+ // author = wolfgang (match:contains, lowercase query): only "Johann Wolfgang von Goethe" qualifies
+ jsonString = getString(getClass().getResource(
+ "/queries/collections/collection_goethe.jsonld").getFile());
+ ks = new Krill(jsonString);
+ kc = ks.getCollection();
+ kc.setIndex(ki);
+ assertEquals(1, kc.getCount()); // 1 filter operation
+ assertEquals(1, kc.numberOf("documents"));
+
+ // author = Wolfgang (match:contains, capitalized query): must select the same single document
+ jsonString = getString(getClass().getResource(
+ "/queries/collections/collection_goethe_2.jsonld").getFile());
+ ks = new Krill(jsonString);
+ kc = ks.getCollection();
+ kc.setIndex(ki);
+ assertEquals(1, kc.getCount()); // 1 filter operation
+ assertEquals(1, kc.numberOf("documents"));
+ // Run the last request (author = Wolfgang) against the index: one total result is expected
+ Result kr = ks.apply(ki);
+
+ ObjectMapper mapper = new ObjectMapper();
+ JsonNode res = mapper.readTree(kr.toJsonString());
+ assertEquals(1, res.at("/meta/totalResults").asInt());
+ };
+
@Test
public void searchMetaContext () throws IOException {
diff --git a/src/test/resources/queries/collections/collection_goethe.jsonld b/src/test/resources/queries/collections/collection_goethe.jsonld
new file mode 100644
index 0000000..3a6449d
--- /dev/null
+++ b/src/test/resources/queries/collections/collection_goethe.jsonld
@@ -0,0 +1,24 @@
+{
+ "@context" : "http://ids-mannheim.de/ns/KorAP/json-ld/v0.3/context.jsonld",
+ "collection" : {
+ "@type" : "koral:doc",
+ "key" : "author",
+ "type" : "type:string",
+ "value" : "wolfgang",
+ "match" : "match:contains"
+ },
+ "meta" : {
+ "fields":["textSigle","author", "title"],
+ "count":2
+ },
+ "query" : {
+ "@type" : "koral:token",
+ "wrap" : {
+ "@type" : "koral:term",
+ "key" : "a",
+ "layer" : "orth",
+ "match" : "match:eq"
+ }
+ },
+ "warnings" : []
+}
diff --git a/src/test/resources/queries/collections/collection_goethe_2.jsonld b/src/test/resources/queries/collections/collection_goethe_2.jsonld
new file mode 100644
index 0000000..9644e99
--- /dev/null
+++ b/src/test/resources/queries/collections/collection_goethe_2.jsonld
@@ -0,0 +1,24 @@
+{
+ "@context" : "http://ids-mannheim.de/ns/KorAP/json-ld/v0.3/context.jsonld",
+ "collection" : {
+ "@type" : "koral:doc",
+ "key" : "author",
+ "type" : "type:string",
+ "value" : "Wolfgang",
+ "match" : "match:contains"
+ },
+ "meta" : {
+ "fields":["textSigle","author", "title"],
+ "count":2
+ },
+ "query" : {
+ "@type" : "koral:token",
+ "wrap" : {
+ "@type" : "koral:term",
+ "key" : "a",
+ "layer" : "orth",
+ "match" : "match:eq"
+ }
+ },
+ "warnings" : []
+}
diff --git a/src/test/resources/queries/collections/collection_textClass.jsonld b/src/test/resources/queries/collections/collection_textClass.jsonld
new file mode 100644
index 0000000..faf61ec
--- /dev/null
+++ b/src/test/resources/queries/collections/collection_textClass.jsonld
@@ -0,0 +1,35 @@
+{
+ "@context" : "http://ids-mannheim.de/ns/KorAP/json-ld/v0.3/context.jsonld",
+ "collection" : {
+ "operands": [
+ {
+ "value": "wissenschaft",
+ "match": "match:eq",
+ "key": "textClass",
+ "@type": "koral:doc"
+ },
+ {
+ "value": "reisen",
+ "match": "match:eq",
+ "key": "textClass",
+ "@type": "koral:doc"
+ }
+ ],
+ "operation": "operation:and",
+ "@type": "koral:docGroup"
+ },
+ "meta" : {
+ "fields":["textSigle","author", "title"],
+ "count":2
+ },
+ "query" : {
+ "@type" : "koral:token",
+ "wrap" : {
+ "@type" : "koral:term",
+ "key" : "a",
+ "layer" : "orth",
+ "match" : "match:eq"
+ }
+ },
+ "warnings" : []
+}
diff --git a/src/test/resources/queries/collections/collection_textClass_2.jsonld b/src/test/resources/queries/collections/collection_textClass_2.jsonld
new file mode 100644
index 0000000..6549e97
--- /dev/null
+++ b/src/test/resources/queries/collections/collection_textClass_2.jsonld
@@ -0,0 +1,23 @@
+{
+ "@context" : "http://ids-mannheim.de/ns/KorAP/json-ld/v0.3/context.jsonld",
+ "collection" : {
+ "value": "reisen",
+ "match": "match:contains",
+ "key": "textClass",
+ "@type": "koral:doc"
+ },
+ "meta" : {
+ "fields":["textSigle","author", "title"],
+ "count":2
+ },
+ "query" : {
+ "@type" : "koral:token",
+ "wrap" : {
+ "@type" : "koral:term",
+ "key" : "a",
+ "layer" : "orth",
+ "match" : "match:eq"
+ }
+ },
+ "warnings" : []
+}