Support text and string queries on text metadata fields
Change-Id: I2c3e406f071828df6b6757a9101892a2739d316f
diff --git a/Changes b/Changes
index 7ed178a..0970e7d 100644
--- a/Changes
+++ b/Changes
@@ -1,4 +1,4 @@
-0.57 2018-03-28
+0.57 2018-04-04
- [feature] Support text queries in metadata (diewald)
0.56.2 2018-03-23
diff --git a/src/main/java/de/ids_mannheim/korap/collection/CollectionBuilder.java b/src/main/java/de/ids_mannheim/korap/collection/CollectionBuilder.java
index 7dd3816..6aeca37 100644
--- a/src/main/java/de/ids_mannheim/korap/collection/CollectionBuilder.java
+++ b/src/main/java/de/ids_mannheim/korap/collection/CollectionBuilder.java
@@ -12,7 +12,7 @@
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import de.ids_mannheim.korap.util.KrillDate;
-import de.ids_mannheim.korap.index.TextAnalyzer;
+import de.ids_mannheim.korap.index.TextPrependedTokenStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -215,25 +215,24 @@
// Currently this treatment is language specific and
// does too mzch, I guess.
public Filter toFilter () {
- StringReader reader = new StringReader(this.text);
- TextAnalyzer ga = new TextAnalyzer();
PhraseQuery pq = new PhraseQuery();
int pos = 0;
try {
- TokenStream ts = ga.tokenStream(this.field , reader);
+ TextPrependedTokenStream tpts = new TextPrependedTokenStream(this.text);
+ tpts.doNotPrepend();
CharTermAttribute term;
- ts.reset();
- while (ts.incrementToken()) {
- term = ts.getAttribute(CharTermAttribute.class);
+ tpts.reset();
+ while (tpts.incrementToken()) {
+ term = tpts.getAttribute(CharTermAttribute.class);
pq.add(new org.apache.lucene.index.Term(this.field, term.toString()), pos++);
};
- ts.close();
+ tpts.close();
}
catch (IOException ie) {
System.err.println(ie);
return null;
};
- reader.close();
+
return new QueryWrapperFilter(pq);
};
diff --git a/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java b/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java
index 54dc6bb..95c65a2 100644
--- a/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java
+++ b/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java
@@ -53,9 +53,6 @@
@JsonIgnoreProperties(ignoreUnknown = true)
public class FieldDocument extends AbstractDocument {
ObjectMapper mapper = new ObjectMapper();
-
- @JsonIgnore
- private TextAnalyzer analyzer = new TextAnalyzer();
@JsonIgnore
public Document doc = new Document();
@@ -99,14 +96,7 @@
public void addText (String key, String value) {
- Field textField = new Field(key, value, tvField);
- try {
- textField.tokenStream(this.analyzer, null);
- doc.add(textField);
- }
- catch (IOException io) {
- System.err.println(io);
- };
+ doc.add(new TextPrependedField(key, value));
};
diff --git a/src/main/java/de/ids_mannheim/korap/index/TextAnalyzer.java b/src/main/java/de/ids_mannheim/korap/index/TextAnalyzer.java
index 5875c3e..515c6d1 100644
--- a/src/main/java/de/ids_mannheim/korap/index/TextAnalyzer.java
+++ b/src/main/java/de/ids_mannheim/korap/index/TextAnalyzer.java
@@ -6,25 +6,15 @@
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
+import org.apache.commons.io.IOUtils;
+import java.io.IOException;
import java.io.Reader;
-/*
- * TODO:
- * Prepend each term with a special marker like '~' and
- * Prepend the tokenstream with a verbatim representation.
- * That way it's possible to search by term verbatim and by text
- * with a phrasequery!
- */
-
public class TextAnalyzer extends Analyzer {
- // private static String verbatim;
-
@Override
protected TokenStreamComponents createComponents (final String fieldName) {
- final Tokenizer source = new TextPrependTokenizer();
+ final Tokenizer source = new StandardTokenizer();
TokenStream sink = new LowerCaseFilter(source);
- sink = new TextTokenFilter(sink);
- // source.setVerbatim(this.verbatim);
return new TokenStreamComponents(source, sink);
};
};
diff --git a/src/main/java/de/ids_mannheim/korap/index/TextPrependTokenizer.java b/src/main/java/de/ids_mannheim/korap/index/TextPrependTokenizer.java
deleted file mode 100644
index 0613d2c..0000000
--- a/src/main/java/de/ids_mannheim/korap/index/TextPrependTokenizer.java
+++ /dev/null
@@ -1,107 +0,0 @@
-package de.ids_mannheim.korap.index;
-
-// This code is pretty similar to
-// org.apache.lucene.analysis.standard.StandardTokenizer,
-// but prepends a verbatim string to the TokenStream
-
-import java.io.IOException;
-import java.io.Reader;
-
-import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.standard.StandardTokenizerImpl;
-import org.apache.lucene.util.AttributeFactory;
-import org.apache.lucene.util.AttributeSource;
-import org.apache.commons.io.IOUtils;
-import java.io.StringReader;
-
-
-public final class TextPrependTokenizer extends Tokenizer {
-
- /** A private instance of the JFlex-constructed scanner */
- private StandardTokenizerImpl scanner;
-
- private int skippedPositions;
-
- private String verbatim;
- private Boolean init = true;
-
- private int maxTokenLength = 1024 * 1024;
-
- public TextPrependTokenizer() {
- init();
- }
-
- public TextPrependTokenizer(AttributeFactory factory) {
- super(factory);
- init();
- }
-
- private void init() {
- /*try {
- System.err.println(IOUtils.toString(reader));
- }
- catch (IOException io) {
- System.err.println("Exception: " + io);
- };
- System.err.println(input.reset());
- */
- this.scanner = new StandardTokenizerImpl(input);
- this.init = true;
- }
-
- public void setVerbatim (String v) {
- this.verbatim = v;
- };
-
- private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
- private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
-
- @Override
- public final boolean incrementToken() throws IOException {
- clearAttributes();
- skippedPositions = 0;
-
- if (this.init) {
- posIncrAtt.setPositionIncrement(10000);
- termAtt.append("[PREPEND]");
- this.init = false;
- return true;
- };
-
- while(true) {
- int tokenType = scanner.getNextToken();
-
- if (tokenType == StandardTokenizerImpl.YYEOF) {
- return false;
- }
-
- if (scanner.yylength() <= maxTokenLength) {
- posIncrAtt.setPositionIncrement(skippedPositions+1);
- scanner.getText(termAtt);
- return true;
- } else
- skippedPositions++;
- }
- }
-
- @Override
- public final void end() throws IOException {
- super.end();
- posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement()+skippedPositions);
- }
-
- @Override
- public void close() throws IOException {
- super.close();
- scanner.yyreset(input);
- }
-
- @Override
- public void reset() throws IOException {
- super.reset();
- scanner.yyreset(input);
- skippedPositions = 0;
- }
-}
diff --git a/src/main/java/de/ids_mannheim/korap/index/TextPrependedField.java b/src/main/java/de/ids_mannheim/korap/index/TextPrependedField.java
new file mode 100644
index 0000000..ea57a84
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/index/TextPrependedField.java
@@ -0,0 +1,28 @@
+package de.ids_mannheim.korap.index;
+
+import java.io.Reader;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
+import org.apache.lucene.document.TextField;
+import java.io.IOException;
+
+public final class TextPrependedField extends Field {
+
+ public static FieldType TEXT = new FieldType(TextField.TYPE_STORED);
+
+ static {
+ TEXT.setStoreTermVectors(true);
+ TEXT.setStoreTermVectorPositions(true);
+ TEXT.setStoreTermVectorPayloads(true);
+ TEXT.setStoreTermVectorOffsets(false);
+ };
+
+ public TextPrependedField(String name, String value) {
+ super(name, value, TEXT);
+ TextPrependedTokenStream tpts = new TextPrependedTokenStream(value);
+ this.setTokenStream(tpts);
+ };
+};
diff --git a/src/main/java/de/ids_mannheim/korap/index/TextPrependedTokenStream.java b/src/main/java/de/ids_mannheim/korap/index/TextPrependedTokenStream.java
new file mode 100644
index 0000000..ea565e1
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/index/TextPrependedTokenStream.java
@@ -0,0 +1,77 @@
+package de.ids_mannheim.korap.index;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.*;
+import org.apache.lucene.analysis.standard.StandardTokenizerImpl;
+import org.apache.lucene.analysis.util.CharacterUtils;
+import java.io.IOException;
+import java.io.StringReader;
+
+/**
+ * Create a tokenstream with the first token being the verbatim string.
+ * All following tokens are standardtokenized and lowercased.
+ */
+
+public class TextPrependedTokenStream extends TokenStream {
+ private final CharTermAttribute charTermAttr = this.addAttribute(CharTermAttribute.class);
+ private final PositionIncrementAttribute posIncrAttr = this.addAttribute(PositionIncrementAttribute.class);
+ private final CharacterUtils charUtils = CharacterUtils.getInstance();
+ private Boolean init;
+ private String verbatim;
+ private int skippedPositions;
+
+ /** A private instance of the JFlex-constructed scanner */
+ private StandardTokenizerImpl scanner;
+ private final int maxTokenLength = 1024 * 1024;
+
+ /** Constructor */
+ public TextPrependedTokenStream (String text) {
+ this.init = true;
+ this.verbatim = text;
+ this.scanner = null;
+ };
+
+    /** Do not prepend the verbatim string at the beginning */
+ public void doNotPrepend () {
+ this.init = false;
+ };
+
+ @Override
+ public final boolean incrementToken () throws IOException {
+ clearAttributes();
+ skippedPositions = 0;
+
+        // Prepend the verbatim string at the beginning
+ if (this.init) {
+ posIncrAttr.setPositionIncrement(255);
+ charTermAttr.append(this.verbatim);
+ this.init = false;
+ return true;
+ };
+
+ // Initialize the scanner
+ if (this.scanner == null) {
+ this.scanner = new StandardTokenizerImpl(
+ new StringReader(this.verbatim)
+ );
+ };
+
+ // Increment tokens by wrapping the scanner like the StandardTokenizer
+ while(true) {
+ int tokenType = scanner.getNextToken();
+
+ if (tokenType == StandardTokenizerImpl.YYEOF) {
+ return false;
+ }
+
+ if (scanner.yylength() <= maxTokenLength) {
+ posIncrAttr.setPositionIncrement(
+ skippedPositions+1
+ );
+ scanner.getText(charTermAttr);
+ charUtils.toLowerCase(charTermAttr.buffer(), 0, charTermAttr.length());
+ return true;
+ } else
+ skippedPositions++;
+ }
+ };
+};
diff --git a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java
index b7fba53..9a55e65 100644
--- a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java
@@ -6,7 +6,6 @@
import de.ids_mannheim.korap.KrillCollection;
import de.ids_mannheim.korap.collection.CollectionBuilder;
import de.ids_mannheim.korap.index.FieldDocument;
-import de.ids_mannheim.korap.index.TextAnalyzer;
import de.ids_mannheim.korap.response.Result;
import de.ids_mannheim.korap.response.SearchContext;
import de.ids_mannheim.korap.util.StatusCodes;
@@ -151,10 +150,10 @@
.with(cb.term("textClass", "kultur")));
assertEquals(0, kcn.docCount());
- kcn.fromBuilder(cb.term("text", "mann~"));
+ kcn.fromBuilder(cb.term("text", "mann"));
assertEquals(3, kcn.docCount());
- kcn.fromBuilder(cb.term("text", "frau~"));
+ kcn.fromBuilder(cb.term("text", "frau"));
assertEquals(1, kcn.docCount());
};
@@ -300,39 +299,6 @@
@Test
- public void testIndexStream () throws IOException {
- ki = new KrillIndex();
- FieldDocument fd = ki.addDoc(createDoc1());
- ki.commit();
- Analyzer ana = new TextAnalyzer();
- TokenStream ts = fd.doc.getField("text").tokenStream(ana, null);
-
- CharTermAttribute charTermAttribute = ts
- .addAttribute(CharTermAttribute.class);
- ts.reset();
-
- ts.incrementToken();
- assertEquals("[PREPEND2]", charTermAttribute.toString());
- ts.incrementToken();
- assertEquals("[prepend]", charTermAttribute.toString());
- ts.incrementToken();
- assertEquals("der", charTermAttribute.toString());
- ts.incrementToken();
- assertEquals("alte", charTermAttribute.toString());
- ts.incrementToken();
- assertEquals("mann", charTermAttribute.toString());
- ts.incrementToken();
- assertEquals("ging", charTermAttribute.toString());
- ts.incrementToken();
- assertEquals("über", charTermAttribute.toString());
- ts.incrementToken();
- assertEquals("die", charTermAttribute.toString());
- ts.incrementToken();
- assertEquals("straße", charTermAttribute.toString());
- };
-
-
- @Test
public void testIndexWithDateRanges () throws IOException {
ki = new KrillIndex();
ki.addDoc(createDoc1());
@@ -397,7 +363,7 @@
public void testIndexWithRegexes () throws IOException {
ki = new KrillIndex();
- ki.addDoc(createDoc1());
+ FieldDocument fd = ki.addDoc(createDoc1());
ki.addDoc(createDoc2());
ki.addDoc(createDoc3());
ki.commit();
@@ -416,15 +382,22 @@
kcn.fromBuilder(cb.text("text", "Frau"));
assertEquals(1, kcn.docCount());
- kcn.fromBuilder(cb.term("text", "frau~"));
+ kcn.fromBuilder(cb.term("text", "frau"));
assertEquals(1, kcn.docCount());
- kcn.fromBuilder(cb.re("text", "frau."));
+ kcn.fromBuilder(cb.re("text", "fra."));
assertEquals(1, kcn.docCount());
- kcn.fromBuilder(cb.re("text", "frau.|mann."));
+ kcn.fromBuilder(cb.re("text", "fra.|ma.n"));
assertEquals(3, kcn.docCount());
- };
+
+ String sv = fd.doc.getField("text").stringValue();
+ assertEquals("Der alte Mann ging über die Straße", sv);
+
+ kcn.fromBuilder(cb.term("text", sv));
+ assertEquals(1, kcn.docCount());
+
+ };
@Test
public void testIndexWithTextStringQueries () throws IOException {
@@ -435,18 +408,15 @@
CollectionBuilder cb = new CollectionBuilder();
KrillCollection kcn = new KrillCollection(ki);
- kcn.fromBuilder(cb.term("text", "mann~"));
+ kcn.fromBuilder(cb.term("text", "mann"));
assertEquals(1, kcn.docCount());
- kcn.fromBuilder(cb.text("text", "Mann"));
+ kcn.fromBuilder(cb.term("text", "Der alte Mann ging über die Straße"));
assertEquals(1, kcn.docCount());
- // Simple string tests
- kcn.fromBuilder(cb.text("text", "Der alte Mann"));
-
- // Uses german analyzer for the createDocument
- assertEquals(kcn.toString(), "QueryWrapperFilter(text:\"der~ alte~ mann~\")");
- assertEquals(1, kcn.docCount());
+ kcn.fromBuilder(cb.text("text", "Der alte Mann"));
+ assertEquals(kcn.toString(), "QueryWrapperFilter(text:\"der alte mann\")");
+ assertEquals(1, kcn.docCount());
};
diff --git a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionJSON.java b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionJSON.java
index 2f6803a..ad69ffc 100644
--- a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionJSON.java
+++ b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionJSON.java
@@ -101,7 +101,7 @@
assertFalse(ks.hasErrors());
assertFalse(ks.hasWarnings());
assertFalse(ks.hasMessages());
- assertEquals("-QueryWrapperFilter(author:\"goethe~\")",
+ assertEquals("-QueryWrapperFilter(author:\"goethe\")",
ks.getCollection().toString());
};