Broken commit: Preparations for prepending tokenstream
Change-Id: I4c27ac21620e5bb88ec872d8ca9d5fd5ec2e9594
diff --git a/src/main/java/de/ids_mannheim/korap/index/TextAnalyzer.java b/src/main/java/de/ids_mannheim/korap/index/TextAnalyzer.java
index dd6abb2..7ea159c 100644
--- a/src/main/java/de/ids_mannheim/korap/index/TextAnalyzer.java
+++ b/src/main/java/de/ids_mannheim/korap/index/TextAnalyzer.java
@@ -17,11 +17,22 @@
*/
public class TextAnalyzer extends Analyzer {
-
+ // Disabled: private static String verbatim;
+
@Override
protected TokenStreamComponents createComponents (final String fieldName) {
- final Tokenizer source = new StandardTokenizer();
+ final Tokenizer source = new TextPrependTokenizer();
TokenStream sink = new LowerCaseFilter(source);
- return new TokenStreamComponents(source, sink);
+ // Disabled: sink = new TextTokenFilter(sink);
+ // Disabled: source.setVerbatim(this.verbatim); NOTE(review): source is typed Tokenizer while setVerbatim is declared on TextPrependTokenizer - confirm before enabling
+ return new TokenStreamComponents(source, sink);
};
+
+
+ // Set verbatim (disabled: the field and the call that would hand the value to the tokenizer are commented out above)
+ /*
+ public void setVerbatim (String value) {
+ this.verbatim = value;
+ }
+ */
};
diff --git a/src/main/java/de/ids_mannheim/korap/index/TextPrependTokenizer.java b/src/main/java/de/ids_mannheim/korap/index/TextPrependTokenizer.java
new file mode 100644
index 0000000..6747584
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/index/TextPrependTokenizer.java
@@ -0,0 +1,136 @@
+package de.ids_mannheim.korap.index;
+
+// This code is pretty similar to
+// org.apache.lucene.analysis.standard.StandardTokenizer,
+// but prepends a verbatim string to the TokenStream
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+// de.ids_mannheim.korap.index.VerbatimAttr
+import org.apache.lucene.analysis.standard.StandardTokenizerImpl;
+
+import org.apache.lucene.util.AttributeFactory;
+
+
+/**
+ * Tokenizer built on the JFlex scanner {@link StandardTokenizerImpl} that emits
+ * the marker term "[PREPEND]" before the scanner's own tokens.
+ * The marker is re-armed in {@link #reset()}, so every stream read through a
+ * reused instance starts with it.
+ */
+public final class TextPrependTokenizer extends Tokenizer {
+
+    /** Term text of the marker token emitted first. */
+    private static final String PREPEND_TERM = "[PREPEND]";
+
+    /** Position increment given to the marker token. */
+    private static final int PREPEND_POSITION_INCREMENT = 10000;
+
+    /** A private instance of the JFlex-constructed scanner */
+    private StandardTokenizerImpl scanner;
+
+    /** Number of over-long tokens skipped since the last emitted token. */
+    private int skippedPositions;
+
+    /** Value handed to setVerbatim; stored only, incrementToken does not read it. */
+    private String verbatim;
+
+    /** True while the marker token still has to be emitted for the current stream. */
+    private boolean prependPending = true;
+
+    /** Tokens whose length exceeds this limit are skipped. */
+    private int maxTokenLength = 1024 * 1024;
+
+    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+
+    public TextPrependTokenizer() {
+        init();
+    }
+
+    public TextPrependTokenizer(AttributeFactory factory) {
+        super(factory);
+        init();
+    }
+
+    /** Creates the scanner over the current input and arms the marker token. */
+    private void init() {
+        this.scanner = new StandardTokenizerImpl(input);
+        this.prependPending = true;
+    }
+
+    /**
+     * Stores a verbatim string on this tokenizer.
+     *
+     * @param v the string to store; it is not consulted when tokens are produced
+     */
+    public void setVerbatim (String v) {
+        this.verbatim = v;
+    }
+
+    /**
+     * Advances to the next token: the marker term first, then the scanner's tokens.
+     *
+     * @return true if a token was produced, false once the scanner reports end of input
+     * @throws IOException propagated from the scanner
+     */
+    @Override
+    public final boolean incrementToken() throws IOException {
+        clearAttributes();
+        skippedPositions = 0;
+
+        if (prependPending) {
+            posIncrAtt.setPositionIncrement(PREPEND_POSITION_INCREMENT);
+            termAtt.append(PREPEND_TERM);
+            prependPending = false;
+            return true;
+        }
+
+        while (true) {
+            int tokenType = scanner.getNextToken();
+
+            if (tokenType == StandardTokenizerImpl.YYEOF) {
+                return false;
+            }
+
+            if (scanner.yylength() <= maxTokenLength) {
+                posIncrAtt.setPositionIncrement(skippedPositions + 1);
+                scanner.getText(termAtt);
+                return true;
+            }
+
+            // Over-long token: drop it, but let the next token's increment account for it.
+            skippedPositions++;
+        }
+    }
+
+    /** Adds the positions skipped after the last emitted token to the final increment. */
+    @Override
+    public final void end() throws IOException {
+        super.end();
+        posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
+    }
+
+    /** Delegates to the superclass close, then resets the scanner onto the current input. */
+    @Override
+    public void close() throws IOException {
+        super.close();
+        scanner.yyreset(input);
+    }
+
+    /**
+     * Prepares the tokenizer for a new stream.
+     * Re-arms the marker token: without this, only the first stream read through
+     * an instance would start with it.
+     */
+    @Override
+    public void reset() throws IOException {
+        super.reset();
+        scanner.yyreset(input);
+        skippedPositions = 0;
+        prependPending = true;
+    }
+}
diff --git a/src/main/java/de/ids_mannheim/korap/index/TextTokenFilter.java b/src/main/java/de/ids_mannheim/korap/index/TextTokenFilter.java
new file mode 100644
index 0000000..77f8d86
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/index/TextTokenFilter.java
@@ -0,0 +1,75 @@
+package de.ids_mannheim.korap.index;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+
+/*
+ * THIS IS PROBABLY USELESS
+ */
+
+/**
+ * Token filter that emits a verbatim term once, ahead of the tokens of the
+ * wrapped stream, when a verbatim string has been set.
+ */
+public final class TextTokenFilter extends TokenFilter {
+
+    /** Position increment given to the prepended term. */
+    private static final int PREPEND_POSITION_INCREMENT = 10000;
+
+    /** True while the verbatim term may still be emitted for the current stream. */
+    private boolean initTerm;
+
+    /** Pending verbatim term; per instance, cleared once it has been emitted. */
+    private String verbatim;
+
+    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    private final PositionIncrementAttribute posIncrAtt =
+        addAttribute(PositionIncrementAttribute.class);
+
+    public TextTokenFilter(TokenStream in) {
+        super(in);
+        this.initTerm = true;
+    }
+
+    /**
+     * Sets the term to emit ahead of the wrapped stream's tokens.
+     *
+     * @param v the verbatim term, or null to prepend nothing
+     */
+    public void setVerbatim (String v) {
+        this.verbatim = v;
+    }
+
+    /**
+     * Emits the pending verbatim term first, if any, and otherwise delegates to
+     * the wrapped stream.
+     *
+     * @return true if a token was produced, false when the wrapped stream is exhausted
+     * @throws IOException propagated from the wrapped stream
+     */
+    @Override
+    public final boolean incrementToken() throws IOException {
+        // Prepend verbatim string
+        if (initTerm && verbatim != null) {
+            clearAttributes();
+            termAtt.append(verbatim);
+            posIncrAtt.setPositionIncrement(PREPEND_POSITION_INCREMENT);
+            initTerm = false;
+            verbatim = null;
+            return true;
+        }
+
+        return input.incrementToken();
+    }
+
+    /** Re-arms the one-time prepend so a reused filter can prepend on its next stream. */
+    @Override
+    public void reset() throws IOException {
+        super.reset();
+        initTerm = true;
+    }
+}
diff --git a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java
index fe796fc..a72033e 100644
--- a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java
@@ -151,10 +151,10 @@
.with(cb.term("textClass", "kultur")));
assertEquals(0, kcn.docCount());
- kcn.fromBuilder(cb.term("text", "mann"));
+ kcn.fromBuilder(cb.term("text", "mann~"));
assertEquals(3, kcn.docCount());
- kcn.fromBuilder(cb.term("text", "frau"));
+ kcn.fromBuilder(cb.term("text", "frau~"));
assertEquals(1, kcn.docCount());
};
@@ -312,6 +312,8 @@
ts.reset();
ts.incrementToken();
+ assertEquals("[prepend]", charTermAttribute.toString());
+ ts.incrementToken();
assertEquals("der", charTermAttribute.toString());
ts.incrementToken();
assertEquals("alte", charTermAttribute.toString());
@@ -412,13 +414,13 @@
kcn.fromBuilder(cb.text("text", "Frau"));
assertEquals(1, kcn.docCount());
- kcn.fromBuilder(cb.term("text", "frau"));
+ kcn.fromBuilder(cb.term("text", "frau~"));
assertEquals(1, kcn.docCount());
- kcn.fromBuilder(cb.re("text", "frau"));
+ kcn.fromBuilder(cb.re("text", "frau."));
assertEquals(1, kcn.docCount());
- kcn.fromBuilder(cb.re("text", "frau|mann"));
+ kcn.fromBuilder(cb.re("text", "frau.|mann."));
assertEquals(3, kcn.docCount());
};
@@ -431,7 +433,7 @@
CollectionBuilder cb = new CollectionBuilder();
KrillCollection kcn = new KrillCollection(ki);
- kcn.fromBuilder(cb.term("text", "mann"));
+ kcn.fromBuilder(cb.term("text", "mann~"));
assertEquals(1, kcn.docCount());
kcn.fromBuilder(cb.text("text", "Mann"));
@@ -441,7 +443,7 @@
kcn.fromBuilder(cb.text("text", "Der alte Mann"));
// Uses german analyzer for the createDocument
- assertEquals(kcn.toString(), "QueryWrapperFilter(text:\"der alte mann\")");
+ assertEquals(kcn.toString(), "QueryWrapperFilter(text:\"der~ alte~ mann~\")");
assertEquals(1, kcn.docCount());
};
diff --git a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionJSON.java b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionJSON.java
index ad69ffc..2f6803a 100644
--- a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionJSON.java
+++ b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionJSON.java
@@ -101,7 +101,7 @@
assertFalse(ks.hasErrors());
assertFalse(ks.hasWarnings());
assertFalse(ks.hasMessages());
- assertEquals("-QueryWrapperFilter(author:\"goethe\")",
+ assertEquals("-QueryWrapperFilter(author:\"goethe~\")",
ks.getCollection().toString());
};