Broken commit: Prepending in TextPrependTokenizer
Change-Id: Id84d258a433eb536239262682d7d2f4000caf077
diff --git a/pom.xml b/pom.xml
index 1094693..5b72473 100644
--- a/pom.xml
+++ b/pom.xml
@@ -196,19 +196,23 @@
annotation <dependency> <groupId>KorapAnnotationProcessor</groupId> <artifactId>KorapAnnotationProcessor</artifactId>
<version>0.0.1-SNAPSHOT</version> <scope>compile</scope> </dependency> -->
- <!-- Some language extensions like StringUtil -->
+ <!-- Some language extensions like StringUtils, IOUtils ... -->
<dependency>
<groupId>commons-lang</groupId>
<artifactId>commons-lang</artifactId>
<version>2.3</version>
</dependency>
-
<dependency>
<groupId>commons-cli</groupId>
<artifactId>commons-cli</artifactId>
<version>1.3.1</version>
</dependency>
- </dependencies>
+ <dependency>
+ <groupId>org.apache.commons</groupId>
+ <artifactId>commons-io</artifactId>
+ <version>1.3.2</version>
+ </dependency>
+ </dependencies>
<build>
<sourceDirectory>${basedir}/src/main/java</sourceDirectory>
diff --git a/src/main/java/de/ids_mannheim/korap/index/TextAnalyzer.java b/src/main/java/de/ids_mannheim/korap/index/TextAnalyzer.java
index 7ea159c..5875c3e 100644
--- a/src/main/java/de/ids_mannheim/korap/index/TextAnalyzer.java
+++ b/src/main/java/de/ids_mannheim/korap/index/TextAnalyzer.java
@@ -23,16 +23,8 @@
protected TokenStreamComponents createComponents (final String fieldName) {
final Tokenizer source = new TextPrependTokenizer();
TokenStream sink = new LowerCaseFilter(source);
- // sink = new TextTokenFilter(sink);
+ sink = new TextTokenFilter(sink);
// source.setVerbatim(this.verbatim);
return new TokenStreamComponents(source, sink);
};
-
-
- // Set verbatim
- /*
- public void setVerbatim (String value) {
- this.verbatim = value;
- }
- */
};
diff --git a/src/main/java/de/ids_mannheim/korap/index/TextPrependTokenizer.java b/src/main/java/de/ids_mannheim/korap/index/TextPrependTokenizer.java
index 6747584..0613d2c 100644
--- a/src/main/java/de/ids_mannheim/korap/index/TextPrependTokenizer.java
+++ b/src/main/java/de/ids_mannheim/korap/index/TextPrependTokenizer.java
@@ -5,14 +5,16 @@
// but prepends a verbatim string to the TokenStream
import java.io.IOException;
+import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-// de.ids_mannheim.korap.index.VerbatimAttr
import org.apache.lucene.analysis.standard.StandardTokenizerImpl;
-
import org.apache.lucene.util.AttributeFactory;
+import org.apache.lucene.util.AttributeSource;
+import org.apache.commons.io.IOUtils;
+import java.io.StringReader;
public final class TextPrependTokenizer extends Tokenizer {
@@ -37,6 +39,14 @@
}
private void init() {
+ /*try {
+ System.err.println(IOUtils.toString(reader));
+ }
+ catch (IOException io) {
+ System.err.println("Exception: " + io);
+ };
+ System.err.println(input.reset());
+ */
this.scanner = new StandardTokenizerImpl(input);
this.init = true;
}
diff --git a/src/main/java/de/ids_mannheim/korap/index/TextTokenFilter.java b/src/main/java/de/ids_mannheim/korap/index/TextTokenFilter.java
index 77f8d86..c86b13f 100644
--- a/src/main/java/de/ids_mannheim/korap/index/TextTokenFilter.java
+++ b/src/main/java/de/ids_mannheim/korap/index/TextTokenFilter.java
@@ -25,13 +25,13 @@
@Override
public final boolean incrementToken() throws IOException {
+
// Prepend verbatim string
- if (this.initTerm && this.verbatim != null) {
+ if (this.initTerm) {
clearAttributes();
- termAtt.append(this.verbatim);
+ termAtt.append("[PREPEND2]");
posIncrAtt.setPositionIncrement(10000);
this.initTerm = false;
- this.verbatim = null;
return true;
};
diff --git a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java
index a72033e..b7fba53 100644
--- a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java
@@ -312,6 +312,8 @@
ts.reset();
ts.incrementToken();
+ assertEquals("[PREPEND2]", charTermAttribute.toString());
+ ts.incrementToken();
assertEquals("[prepend]", charTermAttribute.toString());
ts.incrementToken();
assertEquals("der", charTermAttribute.toString());
@@ -844,7 +846,7 @@
fd.addString("author", "Frank");
fd.addKeyword("textClass", "Nachricht Kultur Reisen");
fd.addInt("pubDate", 20051210);
- fd.addText("text", "Der alte Mann ging über die Straße");
+ fd.addText("text", "Der alte Mann ging über die Straße");
return fd;
};