Broken commit: Preparations for prepending tokenstream

Change-Id: I4c27ac21620e5bb88ec872d8ca9d5fd5ec2e9594
diff --git a/src/main/java/de/ids_mannheim/korap/index/TextAnalyzer.java b/src/main/java/de/ids_mannheim/korap/index/TextAnalyzer.java
index dd6abb2..7ea159c 100644
--- a/src/main/java/de/ids_mannheim/korap/index/TextAnalyzer.java
+++ b/src/main/java/de/ids_mannheim/korap/index/TextAnalyzer.java
@@ -17,11 +17,22 @@
  */
 
 public class TextAnalyzer extends Analyzer {
-
+	// Disabled together with setVerbatim() below: private static String verbatim;
+	
     @Override
     protected TokenStreamComponents createComponents (final String fieldName) {
-        final Tokenizer source = new StandardTokenizer();
+		final Tokenizer source = new TextPrependTokenizer();
         TokenStream sink = new LowerCaseFilter(source);
-        return new TokenStreamComponents(source, sink);
+		// Disabled alternative via a filter: sink = new TextTokenFilter(sink);
+		// Disabled: source.setVerbatim(this.verbatim); NOTE(review): source is declared as Tokenizer, presumably needs the TextPrependTokenizer type - confirm
+		return new TokenStreamComponents(source, sink);
     };
+
+
+	// Setter for the verbatim string; commented out because the verbatim field above is disabled as well
+	/*
+	public void setVerbatim (String value) {
+		this.verbatim = value;
+	}
+	*/
 };
diff --git a/src/main/java/de/ids_mannheim/korap/index/TextPrependTokenizer.java b/src/main/java/de/ids_mannheim/korap/index/TextPrependTokenizer.java
new file mode 100644
index 0000000..6747584
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/index/TextPrependTokenizer.java
@@ -0,0 +1,140 @@
+package de.ids_mannheim.korap.index;
+
+// This code is pretty similar to
+// org.apache.lucene.analysis.standard.StandardTokenizer,
+// but prepends an extra token to the TokenStream
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+//  de.ids_mannheim.korap.index.VerbatimAttr
+import org.apache.lucene.analysis.standard.StandardTokenizerImpl;
+
+import org.apache.lucene.util.AttributeFactory;
+
+
+/**
+ * Tokenizer that scans its input with {@link StandardTokenizerImpl} and
+ * emits the marker term {@code [PREPEND]} (position increment 10000)
+ * before the first scanned token of every stream.
+ *
+ * The string passed to {@link #setVerbatim} is stored but not emitted yet;
+ * the marker text is still hard-coded.
+ */
+public final class TextPrependTokenizer extends Tokenizer {
+
+	/** A private instance of the JFlex-constructed scanner */
+	private StandardTokenizerImpl scanner;
+
+	/** Over-long tokens skipped since the last emitted token */
+	private int skippedPositions;
+
+	/** Value handed to setVerbatim(); not read anywhere in this class yet */
+	private String verbatim;
+
+	/** True while the marker token of the current stream is still pending */
+	private boolean init = true;
+
+	/** Scanned tokens longer than this are skipped instead of emitted */
+	private int maxTokenLength = 1024 * 1024;
+
+	/** Creates a tokenizer without passing an attribute factory. */
+	public TextPrependTokenizer() {
+		init();
+	}
+
+	/**
+	 * Creates a tokenizer using the given attribute factory.
+	 *
+	 * @param factory factory passed on to the Tokenizer superclass
+	 */
+	public TextPrependTokenizer(AttributeFactory factory) {
+		super(factory);
+		init();
+	}
+
+	// Shared constructor logic: create the scanner and arm the marker token
+	private void init() {
+		this.scanner = new StandardTokenizerImpl(input);
+		this.init = true;
+	}
+
+	/**
+	 * Stores the verbatim string.
+	 *
+	 * @param v the string to remember
+	 */
+	public void setVerbatim (String v) {
+		this.verbatim = v;
+	}
+
+	private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+	private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+
+	/**
+	 * Emits the marker token on the first call after construction or reset(),
+	 * afterwards one scanned token per call.
+	 *
+	 * @return false once the scanner reports end of input, true otherwise
+	 * @throws IOException propagated from the scanner
+	 */
+	@Override
+	public final boolean incrementToken() throws IOException {
+		clearAttributes();
+		skippedPositions = 0;
+
+		if (this.init) {
+			posIncrAtt.setPositionIncrement(10000);
+			termAtt.append("[PREPEND]");
+			this.init = false;
+			return true;
+		}
+
+		while (true) {
+			int tokenType = scanner.getNextToken();
+
+			if (tokenType == StandardTokenizerImpl.YYEOF) {
+				return false;
+			}
+
+			if (scanner.yylength() <= maxTokenLength) {
+				// Account for skipped over-long tokens in the increment
+				posIncrAtt.setPositionIncrement(skippedPositions + 1);
+				scanner.getText(termAtt);
+				return true;
+			}
+
+			// Token too long: drop it, but remember the position it occupied
+			skippedPositions++;
+		}
+	}
+
+	/** Adds the positions skipped at the end of input to the final position increment. */
+	@Override
+	public final void end() throws IOException {
+		super.end();
+		posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
+	}
+
+	/** Delegates to the superclass, then resets the scanner onto the current input. */
+	@Override
+	public void close() throws IOException {
+		super.close();
+		scanner.yyreset(input);
+	}
+
+	/**
+	 * Prepares the tokenizer for a new input. The marker flag is re-armed
+	 * here so that a reused instance prepends the marker to every stream,
+	 * not only to the first one after construction.
+	 */
+	@Override
+	public void reset() throws IOException {
+		super.reset();
+		scanner.yyreset(input);
+		skippedPositions = 0;
+		this.init = true;
+	}
+}
diff --git a/src/main/java/de/ids_mannheim/korap/index/TextTokenFilter.java b/src/main/java/de/ids_mannheim/korap/index/TextTokenFilter.java
new file mode 100644
index 0000000..77f8d86
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/index/TextTokenFilter.java
@@ -0,0 +1,67 @@
+package de.ids_mannheim.korap.index;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+
+/*
+ * THIS IS PROBABLY USELESS
+ */
+
+/**
+ * Token filter meant to emit a verbatim term (position increment 10000)
+ * ahead of the tokens of the wrapped stream.
+ *
+ * Nothing in this class assigns the verbatim string, so as written the
+ * filter only forwards the wrapped stream.
+ */
+public final class TextTokenFilter extends TokenFilter {
+	/** True until the verbatim term has been emitted */
+	private boolean initTerm;
+
+	/**
+	 * Term to prepend, or null for none. Kept per instance: incrementToken()
+	 * clears it, which must not affect other filter instances.
+	 */
+	private String verbatim;
+
+	private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+	private final PositionIncrementAttribute posIncrAtt =
+		addAttribute(PositionIncrementAttribute.class);
+
+	/**
+	 * Wraps the given stream.
+	 *
+	 * @param in the token stream to filter
+	 */
+	public TextTokenFilter(TokenStream in) {
+		super(in);
+		this.initTerm = true;
+	}
+
+	/**
+	 * Emits the verbatim term once if one is set, otherwise advances the
+	 * wrapped stream.
+	 *
+	 * @return true if a token is available, false at the end of the stream
+	 * @throws IOException propagated from the wrapped stream
+	 */
+	@Override
+	public final boolean incrementToken() throws IOException {
+		// Prepend verbatim string
+		if (this.initTerm && this.verbatim != null) {
+			clearAttributes();
+			termAtt.append(this.verbatim);
+			posIncrAtt.setPositionIncrement(10000);
+			this.initTerm = false;
+			this.verbatim = null;
+			return true;
+		}
+
+		// Otherwise hand over to the wrapped stream
+		return input.incrementToken();
+	}
+}
diff --git a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java
index fe796fc..a72033e 100644
--- a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java
@@ -151,10 +151,10 @@
                 .with(cb.term("textClass", "kultur")));
         assertEquals(0, kcn.docCount());
 
-        kcn.fromBuilder(cb.term("text", "mann"));
+        kcn.fromBuilder(cb.term("text", "mann~"));
         assertEquals(3, kcn.docCount());
 
-        kcn.fromBuilder(cb.term("text", "frau"));
+        kcn.fromBuilder(cb.term("text", "frau~"));
         assertEquals(1, kcn.docCount());
     };
 
@@ -312,6 +312,8 @@
         ts.reset();
 
         ts.incrementToken();
+        assertEquals("[prepend]", charTermAttribute.toString());
+        ts.incrementToken();
         assertEquals("der", charTermAttribute.toString());
         ts.incrementToken();
         assertEquals("alte", charTermAttribute.toString());
@@ -412,13 +414,13 @@
 		kcn.fromBuilder(cb.text("text", "Frau"));
         assertEquals(1, kcn.docCount());
 
-        kcn.fromBuilder(cb.term("text", "frau"));
+        kcn.fromBuilder(cb.term("text", "frau~"));
         assertEquals(1, kcn.docCount());
 
-        kcn.fromBuilder(cb.re("text", "frau"));
+        kcn.fromBuilder(cb.re("text", "frau."));
         assertEquals(1, kcn.docCount());
 
-        kcn.fromBuilder(cb.re("text", "frau|mann"));
+        kcn.fromBuilder(cb.re("text", "frau.|mann."));
         assertEquals(3, kcn.docCount());
     };
 
@@ -431,7 +433,7 @@
         CollectionBuilder cb = new CollectionBuilder();
         KrillCollection kcn = new KrillCollection(ki);
 
-        kcn.fromBuilder(cb.term("text", "mann"));
+        kcn.fromBuilder(cb.term("text", "mann~"));
         assertEquals(1, kcn.docCount());
 
 		kcn.fromBuilder(cb.text("text", "Mann"));
@@ -441,7 +443,7 @@
         kcn.fromBuilder(cb.text("text", "Der alte Mann"));
 
 		// Uses german analyzer for the createDocument
-		assertEquals(kcn.toString(), "QueryWrapperFilter(text:\"der alte mann\")");
+		assertEquals(kcn.toString(), "QueryWrapperFilter(text:\"der~ alte~ mann~\")");
 		assertEquals(1, kcn.docCount());
 	};
 
diff --git a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionJSON.java b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionJSON.java
index ad69ffc..2f6803a 100644
--- a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionJSON.java
+++ b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionJSON.java
@@ -101,7 +101,7 @@
         assertFalse(ks.hasErrors());
         assertFalse(ks.hasWarnings());
         assertFalse(ks.hasMessages());
-        assertEquals("-QueryWrapperFilter(author:\"goethe\")",
+        assertEquals("-QueryWrapperFilter(author:\"goethe~\")",
 					 ks.getCollection().toString());
     };