Fixed case insensitivity in filters Change-Id: I702734c9884a17f09ddfb4f9677b85f0eea2681b

commit: 484c3c1fb188c9b47dc5b8ba9c68e79f305e417b [log] [tgz]
author: Akron <nils@diewald-online.de> Tue Jul 07 20:25:44 2015 +0200
committer: Akron <nils@diewald-online.de> Tue Jul 07 20:25:44 2015 +0200
tree: abc4c69cde1b5ba99308af072955e164e3413fcb
parent: 6590c327cbc2b72a337f2b6350a3fc320273fc3e [diff]
diff --git a/Changes b/Changes
index 9a7ca91..1bf1d78 100644
--- a/Changes
+++ b/Changes

@@ -25,6 +25,7 @@
 	  parsing failures (diewald)
 	- [feature] Support '@all' as a 'fields' value for all meta
 	  data fields (diewald)
+	- [bugfix] Fix case sensitivity bug in filters (diewald)
 
 0.51 2015-03-17
         - This is a major version (prepared for the GitHub release)

diff --git a/pom.xml b/pom.xml
index 3c6da01..e6d8a61 100644
--- a/pom.xml
+++ b/pom.xml

@@ -212,6 +212,12 @@
     </dependency>
     -->
 
+    <!-- Some language extensions like StringUtils -->
+    <dependency>
+      <groupId>commons-lang</groupId>
+      <artifactId>commons-lang</artifactId>
+      <version>2.3</version>
+    </dependency>
   </dependencies>
 
   <build>

diff --git a/src/main/java/de/ids_mannheim/korap/KrillCollection.java b/src/main/java/de/ids_mannheim/korap/KrillCollection.java
index 6fe28c9..83e40fa 100644
--- a/src/main/java/de/ids_mannheim/korap/KrillCollection.java
+++ b/src/main/java/de/ids_mannheim/korap/KrillCollection.java

@@ -67,7 +67,7 @@
             .getLogger(KrillCollection.class);
 
     // This advices the java compiler to ignore all loggings
-    public static final boolean DEBUG = false;
+    public static final boolean DEBUG = true;
 
 
     /**
@@ -235,8 +235,14 @@
                 else if (match.equals("match:ne")) {
                     bfilter.andNot(key, json.get("value").asText());
                 }
+                // This may change - but for now the filter value is lowercased, as the analyzers lowercase the indexed terms
+                else if (match.equals("match:contains")) {
+                    bfilter.and(key, json.get("value").asText().toLowerCase());
+                }
+                else if (match.equals("match:excludes")) {
+                    bfilter.andNot(key, json.get("value").asText().toLowerCase());
+                }
                 else {
-                    // TODO!
                     throw new QueryException(0, "Unknown match type");
                 };
 

diff --git a/src/main/java/de/ids_mannheim/korap/KrillIndex.java b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
index 7d27c17..02597d1 100644
--- a/src/main/java/de/ids_mannheim/korap/KrillIndex.java
+++ b/src/main/java/de/ids_mannheim/korap/KrillIndex.java

@@ -15,9 +15,12 @@
 import org.apache.lucene.index.IndexWriterConfig.OpenMode;
 import org.apache.lucene.store.*;
 import org.apache.lucene.analysis.Analyzer;
+/*
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
+*/
 import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
+import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.util.*;
 import org.apache.lucene.util.automaton.*;
 
@@ -209,14 +212,14 @@
         this.directory = directory;
 
         // Add analyzers
-        // TODO: Should probably not be here
+        // TODO: Should probably not be here - make configurable
         Map<String, Analyzer> analyzerPerField = new HashMap<String, Analyzer>();
-        analyzerPerField.put("textClass", new WhitespaceAnalyzer(
-                Version.LUCENE_CURRENT));
-        analyzerPerField.put("foundries", new WhitespaceAnalyzer(
-                Version.LUCENE_CURRENT));
+        analyzerPerField.put("textClass", new KeywordAnalyzer());
+        analyzerPerField.put("keywords", new KeywordAnalyzer());
+        analyzerPerField.put("foundries", new KeywordAnalyzer());
         PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(
-                new StandardAnalyzer(Version.LUCENE_CURRENT), analyzerPerField);
+          new TextAnalyzer(), analyzerPerField
+        );
 
         // Create configuration with base analyzer
         this.config = new IndexWriterConfig(Version.LUCENE_CURRENT, analyzer);
@@ -1469,6 +1472,18 @@
         return kr;
     };
 
+    public void getFields () {
+        /*
+         * TODO: Not implemented yet - should return a map of key, value pairs:
+         *
+         * keywords => keywords (contains)
+         * author => text (contains)
+         */
+    };
+
+    public void getValues (String field) {
+
+    };
 
     // Collect matches
     public MatchCollector collect (Krill ks, MatchCollector mc) {

diff --git a/src/main/java/de/ids_mannheim/korap/collection/RegexFilter.java b/src/main/java/de/ids_mannheim/korap/collection/RegexFilter.java
index f9d881c..c948dab 100644
--- a/src/main/java/de/ids_mannheim/korap/collection/RegexFilter.java
+++ b/src/main/java/de/ids_mannheim/korap/collection/RegexFilter.java

@@ -9,7 +9,7 @@
  * @author Nils Diewald
  * 
  *         RegexFilter implements a helper object for
- *         regular expressions used in KorapFilter
+ *         regular expressions used in KrillCollection
  *         constraints.
  */
 

diff --git a/src/main/java/de/ids_mannheim/korap/index/KeywordAnalyzer.java b/src/main/java/de/ids_mannheim/korap/index/KeywordAnalyzer.java
new file mode 100644
index 0000000..e20af6f
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/index/KeywordAnalyzer.java

@@ -0,0 +1,19 @@
+package de.ids_mannheim.korap.index;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
+import java.io.Reader;
+
+public class KeywordAnalyzer extends Analyzer {
+
+  @Override
+  protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
+    final Tokenizer source = new WhitespaceTokenizer(reader);
+    TokenStream sink = new LowerCaseFilter(source);
+    return new TokenStreamComponents(source, sink);
+  };
+};

diff --git a/src/main/java/de/ids_mannheim/korap/index/TextAnalyzer.java b/src/main/java/de/ids_mannheim/korap/index/TextAnalyzer.java
new file mode 100644
index 0000000..bd5181b
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/index/TextAnalyzer.java

@@ -0,0 +1,19 @@
+package de.ids_mannheim.korap.index;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
+import java.io.Reader;
+
+public class TextAnalyzer extends Analyzer {
+
+  @Override
+  protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
+    final Tokenizer source = new StandardTokenizer(reader);
+    TokenStream sink = new LowerCaseFilter(source);
+    return new TokenStreamComponents(source, sink);
+  };
+};

diff --git a/src/test/java/de/ids_mannheim/korap/search/TestMetaFields.java b/src/test/java/de/ids_mannheim/korap/search/TestMetaFields.java
index 618218f..d0862a4 100644
--- a/src/test/java/de/ids_mannheim/korap/search/TestMetaFields.java
+++ b/src/test/java/de/ids_mannheim/korap/search/TestMetaFields.java

@@ -17,6 +17,11 @@
 import java.nio.charset.StandardCharsets;
 import java.nio.ByteBuffer;
 
+import org.apache.commons.lang.StringUtils;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.fasterxml.jackson.databind.JsonNode;
 
@@ -210,6 +215,96 @@
                 .asText());
     };
 
+    @Test
+    public void searchCollectionFields () throws IOException {
+        KrillIndex ki = new KrillIndex();
+        FieldDocument fd = new FieldDocument();
+        fd.addString("corpusSigle", "ABC");
+        fd.addString("docSigle", "ABC-123");
+        fd.addString("textSigle", "ABC-123-0001");
+        fd.addText("title", "Die Wahlverwandschaften");
+        fd.addText("author", "Johann Wolfgang von Goethe");
+        fd.addKeyword("textClass", "reisen wissenschaft");
+        fd.addInt("pubDate", 20130617);
+        fd.addTV("tokens", "abc", "[(0-1)s:a|i:a|_0#0-1|-:t$<i>10]"
+                + "[(1-2)s:b|i:b|_1#1-2]" + "[(2-3)s:c|i:c|_2#2-3]");
+        ki.addDoc(fd);
+
+        FieldDocument fd2 = new FieldDocument();
+        fd2.addString("corpusSigle", "ABC");
+        fd2.addString("docSigle", "ABC-125");
+        fd2.addString("textSigle", "ABC-125-0001");
+        fd2.addText("title", "Die Glocke");
+        fd2.addText("author", "Schiller, Friedrich");
+        fd2.addKeyword("textClass", "Reisen geschichte");
+        fd2.addInt("pubDate", 20130203);
+        fd2.addTV("tokens", "abc", "[(0-1)s:a|i:a|_0#0-1|-:t$<i>10]"
+                + "[(1-2)s:b|i:b|_1#1-2]" + "[(2-3)s:c|i:c|_2#2-3]");
+        ki.addDoc(fd2);
+        ki.commit();
+
+        // textClass = reisen & wissenschaft
+        String jsonString = getString(getClass().getResource(
+        "/queries/collections/collection_textClass.jsonld").getFile());
+        Krill ks = new Krill(jsonString);
+        KrillCollection kc = ks.getCollection();
+        kc.setIndex(ki);
+        assertEquals(1, kc.getCount()); // 1 filter operation
+        assertEquals(1, kc.numberOf("documents"));
+
+        // textClass = reisen
+        jsonString = getString(getClass().getResource(
+        "/queries/collections/collection_textClass_2.jsonld").getFile());
+        ks = new Krill(jsonString);
+        kc = ks.getCollection();
+        kc.setIndex(ki);
+        assertEquals(1, kc.getCount()); // 1 filter operation
+        assertEquals(2, kc.numberOf("documents"));
+
+        /*
+        System.err.println(StringUtils.join(fd2.doc.getValues("textClass"), ","));
+        System.err.println(StringUtils.join(fd2.doc.getValues("author"), ", "));
+        */
+        /*
+        TokenStream ts = fd2.doc.getField("author").tokenStream(
+            (Analyzer) ki.writer().getAnalyzer(),
+            (TokenStream) null
+                                                                  );
+        // OffsetAttribute offsetAttribute = ts.addAttribute(OffsetAttribute.class);
+        CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
+
+        ts.reset();
+        while (ts.incrementToken()) {
+            String term = charTermAttribute.toString();
+            System.err.println(">>" + term + "<<");
+        };
+        */
+
+        // author = wolfgang
+        jsonString = getString(getClass().getResource(
+        "/queries/collections/collection_goethe.jsonld").getFile());
+        ks = new Krill(jsonString);
+        kc = ks.getCollection();
+        kc.setIndex(ki);
+        assertEquals(1, kc.getCount()); // 1 filter operation
+        assertEquals(1, kc.numberOf("documents"));
+
+        // author = Wolfgang
+        jsonString = getString(getClass().getResource(
+        "/queries/collections/collection_goethe_2.jsonld").getFile());
+        ks = new Krill(jsonString);
+        kc = ks.getCollection();
+        kc.setIndex(ki);
+        assertEquals(1, kc.getCount()); // 1 filter operation
+        assertEquals(1, kc.numberOf("documents"));
+
+        Result kr = ks.apply(ki);
+        
+        ObjectMapper mapper = new ObjectMapper();
+        JsonNode res = mapper.readTree(kr.toJsonString());
+        assertEquals(1, res.at("/meta/totalResults").asInt());
+    };
+
 
     @Test
     public void searchMetaContext () throws IOException {

diff --git a/src/test/resources/queries/collections/collection_goethe.jsonld b/src/test/resources/queries/collections/collection_goethe.jsonld
new file mode 100644
index 0000000..3a6449d
--- /dev/null
+++ b/src/test/resources/queries/collections/collection_goethe.jsonld

@@ -0,0 +1,24 @@
+{
+  "@context" : "http://ids-mannheim.de/ns/KorAP/json-ld/v0.3/context.jsonld",
+  "collection" : {
+    "@type" : "koral:doc",
+    "key" : "author",
+    "type" : "type:string",
+    "value" : "wolfgang",
+    "match" : "match:contains" 
+  },
+  "meta" : {
+    "fields":["textSigle","author", "title"],
+    "count":2
+  },
+  "query" : {
+    "@type" : "koral:token",
+    "wrap" : {
+      "@type" : "koral:term",
+      "key" : "a",
+      "layer" : "orth",
+      "match" : "match:eq"
+    }
+  },
+  "warnings" : []
+}

diff --git a/src/test/resources/queries/collections/collection_goethe_2.jsonld b/src/test/resources/queries/collections/collection_goethe_2.jsonld
new file mode 100644
index 0000000..9644e99
--- /dev/null
+++ b/src/test/resources/queries/collections/collection_goethe_2.jsonld

@@ -0,0 +1,24 @@
+{
+  "@context" : "http://ids-mannheim.de/ns/KorAP/json-ld/v0.3/context.jsonld",
+  "collection" : {
+    "@type" : "koral:doc",
+    "key" : "author",
+    "type" : "type:string",
+    "value" : "Wolfgang",
+    "match" : "match:contains" 
+  },
+  "meta" : {
+    "fields":["textSigle","author", "title"],
+    "count":2
+  },
+  "query" : {
+    "@type" : "koral:token",
+    "wrap" : {
+      "@type" : "koral:term",
+      "key" : "a",
+      "layer" : "orth",
+      "match" : "match:eq"
+    }
+  },
+  "warnings" : []
+}

diff --git a/src/test/resources/queries/collections/collection_textClass.jsonld b/src/test/resources/queries/collections/collection_textClass.jsonld
new file mode 100644
index 0000000..faf61ec
--- /dev/null
+++ b/src/test/resources/queries/collections/collection_textClass.jsonld

@@ -0,0 +1,35 @@
+{
+  "@context" : "http://ids-mannheim.de/ns/KorAP/json-ld/v0.3/context.jsonld",
+  "collection" : {
+    "operands": [
+      {
+        "value": "wissenschaft",
+        "match": "match:eq",
+        "key": "textClass",
+        "@type": "koral:doc"
+      },
+      {
+        "value": "reisen",
+        "match": "match:eq",
+        "key": "textClass",
+        "@type": "koral:doc"
+      }
+    ],
+    "operation": "operation:and",
+    "@type": "koral:docGroup"
+  },
+  "meta" : {
+    "fields":["textSigle","author", "title"],
+    "count":2
+  },
+  "query" : {
+    "@type" : "koral:token",
+    "wrap" : {
+      "@type" : "koral:term",
+      "key" : "a",
+      "layer" : "orth",
+      "match" : "match:eq"
+    }
+  },
+  "warnings" : []
+}

diff --git a/src/test/resources/queries/collections/collection_textClass_2.jsonld b/src/test/resources/queries/collections/collection_textClass_2.jsonld
new file mode 100644
index 0000000..6549e97
--- /dev/null
+++ b/src/test/resources/queries/collections/collection_textClass_2.jsonld

@@ -0,0 +1,23 @@
+{
+  "@context" : "http://ids-mannheim.de/ns/KorAP/json-ld/v0.3/context.jsonld",
+  "collection" : {
+    "value": "reisen",
+    "match": "match:contains",
+    "key": "textClass",
+    "@type": "koral:doc"
+  },
+  "meta" : {
+    "fields":["textSigle","author", "title"],
+    "count":2
+  },
+  "query" : {
+    "@type" : "koral:token",
+    "wrap" : {
+      "@type" : "koral:term",
+      "key" : "a",
+      "layer" : "orth",
+      "match" : "match:eq"
+    }
+  },
+  "warnings" : []
+}
commit	484c3c1fb188c9b47dc5b8ba9c68e79f305e417b	[log] [tgz]
author	Akron <nils@diewald-online.de>	Tue Jul 07 20:25:44 2015 +0200
committer	Akron <nils@diewald-online.de>	Tue Jul 07 20:25:44 2015 +0200
tree	abc4c69cde1b5ba99308af072955e164e3413fcb
parent	6590c327cbc2b72a337f2b6350a3fc320273fc3e [diff]