[cleanup] Making the MultiTerm-Family more robust for corrupted input data
diff --git a/CHANGES b/CHANGES
index 0648e5f..9aa88b1 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,3 +1,6 @@
+0.30.4 2014-02-26
+ - [cleanup] Making MultiTerm* more robust.
+
0.30.3 2014-02-20
- Return json request in response if given (diewald)
- [bugfix] ClassSpans and WithinSpans check skipTo
diff --git a/pom.xml b/pom.xml
index e3d15d5..8203bdf 100644
--- a/pom.xml
+++ b/pom.xml
@@ -11,7 +11,7 @@
-->
<groupId>KorAP-modules</groupId>
<artifactId>KorAP-lucene-index</artifactId>
- <version>0.30.3</version>
+ <version>0.30.4</version>
<packaging>jar</packaging>
<name>KorAP-lucene-index</name>
diff --git a/src/main/java/de/ids_mannheim/korap/analysis/MultiTerm.java b/src/main/java/de/ids_mannheim/korap/analysis/MultiTerm.java
index 9c9e306..f9ff3d6 100644
--- a/src/main/java/de/ids_mannheim/korap/analysis/MultiTerm.java
+++ b/src/main/java/de/ids_mannheim/korap/analysis/MultiTerm.java
@@ -8,7 +8,7 @@
/**
* @author Nils Diewald
- * @version 0.2
+ * @version 0.3
*
* MultiTerm represents a term in a MultiTermToken.
*/
@@ -19,6 +19,12 @@
public boolean storeOffsets = false;
public BytesRef payload = null;
+ private static ByteBuffer bb = ByteBuffer.allocate(8);
+ private static String[] stringOffset;
+
+ private static short i, l;
+
+
/**
* The constructor.
*
@@ -34,16 +40,14 @@
MultiTerm test = new MultiTerm("test#0-4");
MultiTerm test = new MultiTerm("test#0-4$Example");
MultiTerm test = new MultiTerm("test#0-4$<i>1278");
+
+ Strings that are malformed fail silently.
*/
public MultiTerm (String term) {
- /*
- this.start = this.end = 0;
- this.storeOffsets = false;
- this.payload = null;
- */
_fromString(term);
};
+
/**
* The constructor with a separated prefix.
* new MultiTerm('a', "bcd") is equivalent to
@@ -56,90 +60,177 @@
*/
public MultiTerm (char prefix, String term) {
StringBuilder sb = new StringBuilder();
- /*
- this.start = this.end = 0;
- this.storeOffsets = false;
- this.payload = null;
- */
- sb.append(prefix).append(':').append(term);
- _fromString(sb.toString());
+ _fromString(sb.append(prefix).append(':').append(term).toString());
};
-
- public void term (String term) {
- this.term = term;
- };
-
- public String term () {
- return this.term;
- };
-
+
/**
- * The constructor.
+ * The empty constructor.
*/
public MultiTerm () {
this.term = "";
- /*
- this.start = this.end = 0;
- this.storeOffsets = false;
- this.payload = null;
- */
};
- public void payload (Byte pl) {
+
+ /**
+ * Sets the term value.
+ *
+ * @param term The term as a string
+ */
+ public void setTerm (String term) {
+ this.term = term;
+ };
+
+
+ /**
+ * Returns the term value.
+ *
+ * @return The term value.
+ */
+ public String getTerm () {
+ return this.term;
+ };
+
+
+ /**
+ * Set the payload as a byte value.
+ *
+ * @param pl The payload.
+ */
+ public void setPayload (Byte pl) {
this.payload = new BytesRef( ByteBuffer.allocate(1).put(pl).array());
};
- public void payload (short pl) {
+
+ /**
+ * Set the payload as a short value.
+ *
+ * @param pl The payload.
+ */
+ public void setPayload (short pl) {
this.payload = new BytesRef( ByteBuffer.allocate(2).putShort(pl).array());
};
- public void payload (int pl) {
+
+ /**
+ * Set the payload as an integer value.
+ *
+ * @param pl The payload.
+ */
+ public void setPayload (int pl) {
this.payload = new BytesRef( ByteBuffer.allocate(4).putInt(pl).array());
};
- public void payload (long pl) {
+
+ /**
+ * Set the payload as a long value.
+ *
+ * @param pl The payload.
+ */
+ public void setPayload (long pl) {
this.payload = new BytesRef( ByteBuffer.allocate(8).putLong(pl).array());
};
- public void payload (String pl) {
+
+ /**
+ * Set the payload as a string value.
+ *
+ * @param pl The payload.
+ */
+ public void setPayload (String pl) {
this.payload = new BytesRef(pl);
};
- public void payload (byte[] pl) {
+
+ /**
+ * Set the payload as a byte array.
+ *
+ * @param pl The payload.
+ */
+ public void setPayload (byte[] pl) {
this.payload = new BytesRef(pl);
};
- public void payload (BytesRef pl) {
+
+ /**
+ * Set the payload as a BytesRef.
+ *
+ * @param pl The payload.
+ */
+ public void setPayload (BytesRef pl) {
this.payload = pl;
};
- public BytesRef payload () {
+ /**
+ * Get the payload.
+ *
+ * @return The payload as a BytesRef.
+ */
+ public BytesRef getPayload () {
return this.payload;
};
- public void start (int value) {
+
+ /**
+ * Set the start position of the term.
+ *
+ * @param value The start position.
+ */
+ public void setStart (int value) {
this.start = value;
};
- public int start () {
+
+ /**
+ * Get the start position.
+ *
+ * @return The start position.
+ */
+ public int getStart () {
return this.start;
};
- public void end (int value) {
+
+ /**
+ * Set the end position of the term.
+ *
+ * @param value The end position.
+ */
+ public void setEnd (int value) {
this.end = value;
};
- public int end () {
+
+ /**
+ * Get the end position.
+ *
+ * @return The end position.
+ */
+ public int getEnd () {
return this.end;
};
- public boolean storeOffsets () {
+
+ /**
+ * Set the flag for stored offsets.
+ *
+ * @param value Boolean value indicating that the term
+ * contains stored offsets.
+ */
+ public void hasStoredOffsets (boolean value) {
+ this.storeOffsets = value;
+ };
+
+
+ /**
+ * Check if there are offsets stored.
+ *
+ * @return Boolean value indicating that the term
+ * contains stored offsets.
+ */
+ public boolean hasStoredOffsets () {
return this.storeOffsets;
};
- public void storeOffsets (boolean value) {
- this.storeOffsets = value;
- };
private void _fromString (String term) {
String[] termSurface = term.split("\\$", 2);
@@ -150,71 +241,52 @@
// Payload has a type
if (payloadStr.charAt(0) == '<' && payloadStr.charAt(2) == '>') {
- ByteBuffer bb = ByteBuffer.allocate(8);
+ // Rewind bytebuffer
+ bb.rewind();
+
+ // Split payload at type marker boundaries
String[] pls = payloadStr.split("(?=<)|(?<=>)");
- int l = 0;
- for (int i = 1; i < pls.length;) {
+ l = 0; // Bytearray length
- // Resize the buffer
- if ((bb.capacity() - l) < 8) {
- bb = ByteBuffer.allocate(bb.capacity() + 8).put(bb.array());
- bb.position(l);
+ try {
+ for (i = 1; i < pls.length;) {
+
+ // Resize the bytebuffer
+ if ((bb.capacity() - l) < 8) {
+ bb = ByteBuffer.allocate(bb.capacity() + 8)
+ .put(bb.array());
+ bb.position(l);
+ };
+
+ switch (pls[i]) {
+ case "<b>": // byte
+ bb.put(Byte.parseByte(pls[i+1]));
+ l++;
+ break;
+ case "<s>": // short
+ bb.putShort(Short.parseShort(pls[i+1]));
+ l+=2;
+ break;
+ case "<i>": // integer
+ bb.putInt(Integer.parseInt(pls[i+1]));
+ l+=4;
+ break;
+ case "<l>": // long
+ bb.putLong(Long.parseLong(pls[i+1]));
+ l+=8;
+ break;
+ };
+ i+=2;
};
- switch (pls[i]) {
- case "<b>": // byte
- bb.put(Byte.parseByte(pls[i+1]));
- l++;
- break;
- case "<s>":
- bb.putShort(Short.parseShort(pls[i+1]));
- l+=2;
- break;
- case "<i>":
- bb.putInt(Integer.parseInt(pls[i+1]));
- l+=4;
- break;
- case "<l>":
- bb.putLong(Long.parseLong(pls[i+1]));
- l+=8;
- break;
- };
- i+=2;
+
+ byte[] bytes = new byte[l];
+ System.arraycopy(bb.array(), 0, bytes, 0, l);
+ this.payload = new BytesRef(bytes);
+ }
+ catch (Exception e) {
};
- byte[] bytes = new byte[l];
- System.arraycopy(bb.array(), 0, bytes, 0, l);
- this.payload = new BytesRef(bytes);
-
-
- /*
- payloadStr = payloadStr.substring(3, payloadStr.length());
- switch (type) {
- case 'b': // byte
-
- System.err.println("bbb");
- payloadBytes = ByteBuffer.allocate(1).put(new Byte(payloadStr)).array();
- break;
- case 's': // short
- payloadBytes = ByteBuffer.allocate(2).putShort(
- Short.parseShort(payloadStr)
- ).array();
- break;
- case 'i': // integer
- payloadBytes = ByteBuffer.allocate(4).putInt(
- Integer.parseInt(payloadStr)
- ).array();
- break;
- case 'l': // long
- payloadBytes = ByteBuffer.allocate(8).putLong(
- Long.parseLong(payloadStr)
- ).array();
- break;
- };
- TODO:
- case '?': // arbitrary
- payloadStr =
- */
}
// Payload is a string
@@ -222,18 +294,24 @@
this.payload = new BytesRef(payloadStr);
};
};
- String[] stringOffset = termSurface[0].split("\\#", 2);
- if (stringOffset.length == 2) {
- String[] offset = stringOffset[1].split("\\-", 2);
+
+ // Parse offset information
+ stringOffset = termSurface[0].split("\\#", 2);
+ if (stringOffset.length == 2) {
+
+ // Split start and end position of the offset
+ String[] offset = stringOffset[1].split("\\-", 2);
+
+ // Start and end is given
if (offset.length == 2 && offset[0].length() > 0) {
- this.start = Integer.parseInt(offset[0]);
- this.end = Integer.parseInt(offset[1]);
- /*
- }
- else {
- this.storeOffsets(false);
- */
+ try {
+ this.start = Integer.parseInt(offset[0]);
+ this.end = Integer.parseInt(offset[1]);
+
+ }
+ catch (NumberFormatException e) {
+ };
};
};
this.term = stringOffset[0];
@@ -249,14 +327,14 @@
* @see #toStringShort().
*/
public String toString () {
+
StringBuilder sb = new StringBuilder(this.term);
+
if (this.start != this.end) {
- sb.append('#').append(this.start).append('-').append(this.end);
- /*
- }
- else if (!this.storeOffsets()) {
- sb.append("#-");
- */
+ sb.append('#')
+ .append(this.start)
+ .append('-')
+ .append(this.end);
};
if (this.payload != null) {
@@ -265,7 +343,8 @@
sb.append(this.payload.utf8ToString());
}
catch (AssertionError e) {
- sb.append("<?>").append(join(',', this.payload.toString().split(" ")));
+ sb.append("<?>")
+ .append(this.payload.toString().replace(' ', ','));
};
};
@@ -283,7 +362,14 @@
public String toStringShort () {
StringBuilder sb = new StringBuilder(this.term);
if (this.payload != null) {
- sb.append('$').append(this.payload.utf8ToString());
+ sb.append('$');
+ try {
+ sb.append(this.payload.utf8ToString());
+ }
+ catch (AssertionError e) {
+ sb.append("<?>")
+ .append(this.payload.toString().replace(' ', ','));
+ };
};
return sb.toString();
};
diff --git a/src/main/java/de/ids_mannheim/korap/analysis/MultiTermToken.java b/src/main/java/de/ids_mannheim/korap/analysis/MultiTermToken.java
index ff70996..9af4156 100644
--- a/src/main/java/de/ids_mannheim/korap/analysis/MultiTermToken.java
+++ b/src/main/java/de/ids_mannheim/korap/analysis/MultiTermToken.java
@@ -3,12 +3,6 @@
import de.ids_mannheim.korap.analysis.MultiTerm;
import java.util.*;
-/*
- Todo:
- - Always write offsets to payloads!
- - Offsets can be overwritten!
- - Check that terms are not ""!!!
-*/
/**
* @author Nils Diewald
@@ -19,8 +13,15 @@
public int start, end = 0;
public List<MultiTerm> terms;
+ private static short i = 0;
+
+ /**
+ * The constructor.
+ *
+ * @param term The first MultiTerm of the token (at least one is required).
+ */
public MultiTermToken (MultiTerm term, MultiTerm ... moreTerms) {
- this.terms = new ArrayList<MultiTerm>();
+ this.terms = new ArrayList<MultiTerm>(16);
if (term.start != term.end) {
this.start = term.start;
@@ -31,91 +32,143 @@
terms.add( term );
// Further elements on same position
- for (int i = 0; i < moreTerms.length; i++) {
+ for (i = 0; i < moreTerms.length; i++) {
term = moreTerms[i];
term.posIncr = 0;
terms.add(term);
};
};
+
+ /**
+ * The constructor.
+ *
+ * @param prefix A term prefix.
+ * @param surface A surface string.
+ */
public MultiTermToken (char prefix, String surface) {
- this.terms = new ArrayList<MultiTerm>();
+ this.terms = new ArrayList<MultiTerm>(16);
MultiTerm term = new MultiTerm(prefix, surface);
- if (term.start != term.end) {
- this.start = term.start;
- this.end = term.end;
- };
+ this.setOffset(term.start, term.end);
// First word element
term.posIncr = 1;
terms.add( term );
};
+
-
+ /**
+ * The constructor.
+ *
+ * @param surface At least one term surface string.
+ */
public MultiTermToken (String surface, String ... moreTerms) {
- this.terms = new ArrayList<MultiTerm>();
+ this.terms = new ArrayList<MultiTerm>(16);
MultiTerm term = new MultiTerm(surface);
- if (term.start != term.end) {
- this.start = term.start;
- this.end = term.end;
- };
+ this.setOffset(term.start, term.end);
// First word element
term.posIncr = 1;
terms.add( term );
-
// Further elements on same position
- for (int i = 0; i < moreTerms.length; i++) {
-
+ for (i = 0; i < moreTerms.length; i++) {
term = new MultiTerm( moreTerms[i] );
+ this.setOffset(term.start, term.end);
term.posIncr = 0;
terms.add(term);
};
};
+
+ /**
+ * Add a new term to the MultiTermToken.
+ *
+ * @param mt A MultiTerm.
+ */
public void add (MultiTerm mt) {
+ mt.posIncr = 0;
+ this.setOffset(mt.start, mt.end);
terms.add(mt);
};
+
+ /**
+ * Add a new term to the MultiTermToken.
+ *
+ * @param term A surface string.
+ */
public void add (String term) {
+ if (term.length() == 0)
+ return;
MultiTerm mt = new MultiTerm(term);
+ this.setOffset(mt.start, mt.end);
mt.posIncr = 0;
terms.add(mt);
};
+ /**
+ * Add a new term to the MultiTermToken.
+ *
+ * @param prefix A prefix character for the surface string.
+ * @param term A surface string.
+ */
public void add (char prefix, String term) {
+ if (term.length() == 0)
+ return;
MultiTerm mt = new MultiTerm(prefix, term);
+ this.setOffset(mt.start, mt.end);
mt.posIncr = 0;
terms.add(mt);
};
- public void offset (int start, int end) {
- this.start = start;
- this.end = end;
+
+ /**
+ * Sets the offset information of the MultiTermToken.
+ *
+ * @param start The character position of the token start.
+ * @param end The character position of the token end.
+ */
+ public void setOffset (int start, int end) {
+ if (start != end) {
+ this.start = (this.start == 0 || start < this.start) ? start : this.start;
+ this.end = end > this.end ? end : this.end;
+ };
};
+ /**
+ * Serialize the MultiTermToken to a string.
+ *
+ * @return A string representation of the token, with leading offset information.
+ */
public String toString () {
StringBuffer sb = new StringBuffer();
sb.append('[');
if (this.start != this.end) {
- sb.append('(').append(this.start).append('-').append(this.end).append(')');
+ sb.append('(')
+ .append(this.start)
+ .append('-')
+ .append(this.end)
+ .append(')');
};
- int i = 0;
+ i = 0;
for (; i < this.terms.size() - 1; i++) {
- sb.append(this.terms.get(i).toStringShort()).append('|');
+ sb.append(this.terms.get(i).toString()).append('|');
};
- sb.append(this.terms.get(i).toStringShort()).append(']');
+ sb.append(this.terms.get(i).toString()).append(']');
return sb.toString();
};
+ /**
+ * Return the number of MultiTerms in the MultiTermToken.
+ */
public int size () {
return this.terms.size();
};
diff --git a/src/main/java/de/ids_mannheim/korap/analysis/MultiTermTokenStream.java b/src/main/java/de/ids_mannheim/korap/analysis/MultiTermTokenStream.java
index 66462d2..b3d2a6f 100644
--- a/src/main/java/de/ids_mannheim/korap/analysis/MultiTermTokenStream.java
+++ b/src/main/java/de/ids_mannheim/korap/analysis/MultiTermTokenStream.java
@@ -10,7 +10,6 @@
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-// import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
@@ -21,13 +20,13 @@
/*
Todo:
- - Do not use offsetAttr!
-# - Payload is [4ByteStartOffset][14BitEndOffset-startOffset][1BitBooleanIfSpan][1BitBooleanIfOpen]
- - Payload is [4ByteOffsetStart][4ByteOffsetStart]
+ - !Payload is [4ByteStartOffset][14BitEndOffset-startOffset][1BitBooleanIfSpan][1BitBooleanIfOpen]
+ - Payload is [4ByteOffsetStart][4ByteOffsetEnd]
*/
/**
* @author Nils Diewald
+ * @version 0.3
*
* MultiTermTokenStream extends Lucenes TokenStream class to work with MultiTermTokens.
*
@@ -35,181 +34,240 @@
*/
public class MultiTermTokenStream extends TokenStream {
private CharTermAttribute charTermAttr;
- // private OffsetAttribute offsetAttr;
private PositionIncrementAttribute posIncrAttr;
private PayloadAttribute payloadAttr;
- private static Pattern pattern = Pattern.compile("\\[(\\(([0-9]+)-([0-9]+)\\))?([^\\]]+?)\\]");
-
- private List<MultiTermToken> multiTermTokens;
- private int mttIndex = 0;
- private int mtIndex = 0;
- // private TokenTextGenerator ttGen = new TokenTextGenerator();
-
- private final Logger log = LoggerFactory.getLogger(MultiTermTokenStream.class);
+ private static final Pattern pattern = Pattern.compile("\\[(?:\\([0-9]+-[0-9]+\\))?([^\\]]+?)\\]");
// This advices the java compiler to ignore all loggings
public static final boolean DEBUG = false;
+ private final Logger log = LoggerFactory.getLogger(MultiTermTokenStream.class);
+ private List<MultiTermToken> multiTermTokens;
+ private int mttIndex = 0, mtIndex = 0;
+ private static short i = 0;
+
+
+ /**
+ * The empty Constructor.
+ */
public MultiTermTokenStream () {
- // this.offsetAttr = this.addAttribute(OffsetAttribute.class);
- this.charTermAttr = this.addAttribute(CharTermAttribute.class);
- this.posIncrAttr = this.addAttribute(PositionIncrementAttribute.class);
- this.payloadAttr = this.addAttribute(PayloadAttribute.class);
- this.multiTermTokens = new ArrayList<MultiTermToken>();
-
- /*
- if (!indexTokens.isEmpty()){
- indexTokens.get(indexTokens.size() - 1).setIncrement(false);
- };
- */
+ this.charTermAttr = this.addAttribute(CharTermAttribute.class);
+ this.posIncrAttr = this.addAttribute(PositionIncrementAttribute.class);
+ this.payloadAttr = this.addAttribute(PayloadAttribute.class);
+ this.multiTermTokens = new ArrayList<MultiTermToken>(100);
};
+
+ /**
+ * The Constructor.
+ *
+ * @param stream The MultiTermTokenStream as a string representation.
+ */
public MultiTermTokenStream (String stream) {
this();
- int pos = 0;
-
Matcher matcher = pattern.matcher(stream);
while (matcher.find()) {
- String[] seg = matcher.group(4).split("\\|");
+ String[] seg = matcher.group(1).split("\\|");
MultiTermToken mtt = new MultiTermToken( seg[0] );
- if (matcher.group(2) != null)
- mtt.start = Integer.parseInt(matcher.group(2));
-
- if (matcher.group(3) != null)
- mtt.end = Integer.parseInt(matcher.group(3));
-
- for (int i = 1; i < seg.length; i++)
+ for (i = 1; i < seg.length; i++)
mtt.add(seg[i]);
this.addMultiTermToken(mtt);
};
};
+
+ /**
+ * Add a MultiTermToken to the end of the MultiTermTokenStream.
+ *
+ * @param mtt A MultiTermToken.
+ */
public void addMultiTermToken (MultiTermToken mtt) {
this.multiTermTokens.add(mtt);
};
+
+ /**
+ * Add a MultiTermToken by means of MultiTerms to the end of
+ * the MultiTermTokenStream.
+ *
+ * @param term At least one MultiTerm.
+ */
public void addMultiTermToken (MultiTerm term, MultiTerm ... moreTerms) {
this.addMultiTermToken(new MultiTermToken(term, moreTerms));
};
+
+ /**
+ * Add a MultiTermToken by means of a single MultiTerm to the end of
+ * the MultiTermTokenStream.
+ *
+ * @param prefix A prefix character of a surface form of a MultiTerm.
+ * @param surface A surface string of a MultiTerm.
+ */
public void addMultiTermToken (char prefix, String surface) {
this.addMultiTermToken(new MultiTermToken(prefix, surface));
};
+
+ /**
+ * Add a MultiTermToken by means of a series of surface strings
+ * to the end of the MultiTermTokenStream.
+ *
+ * @param surface At least one surface string of a MultiTerm.
+ */
public void addMultiTermToken (String surface, String ... moreTerms) {
this.addMultiTermToken(new MultiTermToken(surface, moreTerms));
};
+
+ /**
+ * Add meta information to the MultiTermTokenStream.
+ *
+ * @param key A string for denoting the meta information.
+ * @param value The value of the meta key as a string.
+ */
public void addMeta (String key, String value) {
MultiTerm mt = new MultiTerm('-', key);
- // mt.storeOffsets(false);
- mt.payload(value);
+ mt.setPayload(value);
this.multiTermTokens.get(0).add(mt);
};
+
+ /**
+ * Add meta information to the MultiTermTokenStream.
+ *
+ * @param key A string for denoting the meta information.
+ * @param value The value of the meta key as a byte array.
+ */
public void addMeta (String key, byte[] value) {
MultiTerm mt = new MultiTerm('-', key);
- // mt.storeOffsets(false);
- mt.payload(value);
+ mt.setPayload(value);
this.multiTermTokens.get(0).add(mt);
};
+ /**
+ * Add meta information to the MultiTermTokenStream.
+ *
+ * @param key A string for denoting the meta information.
+ * @param value The value of the meta key as a short value.
+ */
public void addMeta (String key, short value) {
MultiTerm mt = new MultiTerm('-', key);
- // mt.storeOffsets(false);
- mt.payload(value);
+ mt.setPayload(value);
this.multiTermTokens.get(0).add(mt);
};
+
+ /**
+ * Add meta information to the MultiTermTokenStream.
+ *
+ * @param key A string for denoting the meta information.
+ * @param value The value of the meta key as a long value.
+ */
public void addMeta (String key, long value) {
MultiTerm mt = new MultiTerm('-', key);
- // mt.storeOffsets(false);
- mt.payload(value);
+ mt.setPayload(value);
this.multiTermTokens.get(0).add(mt);
};
+
+ /**
+ * Add meta information to the MultiTermTokenStream.
+ *
+ * @param key A string for denoting the meta information.
+ * @param value The value of the meta key as an integer value.
+ */
public void addMeta (String key, int value) {
MultiTerm mt = new MultiTerm('-', key);
- // mt.storeOffsets(false);
- mt.payload(value);
+ mt.setPayload(value);
this.multiTermTokens.get(0).add(mt);
};
+
+ /**
+ * Increment the token in the MultiTermTokenStream.
+ * This overrides the function in Lucene's TokenStream.
+ */
@Override
public final boolean incrementToken() throws IOException {
this.payloadAttr.setPayload(null);
+ // Last token reached
if (this.multiTermTokens.size() == this.mttIndex) {
reset();
return false;
};
+ // Get current token
MultiTermToken mtt = this.multiTermTokens.get( this.mttIndex );
+ // Last term reached
if (mtt.terms.size() == this.mtIndex) {
this.mtIndex = 0;
this.mttIndex++;
+
+ // Last term of last token reached
if (this.multiTermTokens.size() == this.mttIndex) {
reset();
return false;
}
+
+ // Get next token
else {
mtt = this.multiTermTokens.get( this.mttIndex );
};
};
+ // Get current term
MultiTerm mt = mtt.terms.get(this.mtIndex);
- // Get the current index token
-
// Set the relative position to the former term
posIncrAttr.setPositionIncrement( mt.posIncr );
charTermAttr.setEmpty();
charTermAttr.append( mt.term );
BytesRef payload = new BytesRef();
+
+ // There is offset information
if (mt.start != mt.end) {
if (DEBUG)
log.trace("MultiTerm with payload offset: {}-{}", mt.start, mt.end);
+
+ // Add offsets to BytesRef payload
payload.append(new BytesRef(int2byte(mt.start)));
payload.append(new BytesRef(int2byte(mt.end)));
- /*
- }
- else if (mtt.start != mtt.end) {
- payload.append(new BytesRef(int2byte(mtt.start)));
- payload.append(new BytesRef(int2byte(mtt.end)));
- */
};
- // Payload
+ // There is payload in the MultiTerm
if (mt.payload != null) {
- payload.append(mt.payload());
+ payload.append(mt.payload);
if (DEBUG)
log.trace("Create payload[1] {}", payload.toString());
};
+ // There is payload in the current token to index
if (payload.length > 0) {
+ payloadAttr.setPayload(payload);
if (DEBUG)
log.trace("Set payload[2] {}", payload.toString());
- payloadAttr.setPayload(payload);
};
- if (log.isTraceEnabled()) {
+ if (DEBUG) {
StringBuilder sb = new StringBuilder("Index: [");
sb.append(mt.term);
if (payload.length > 0)
sb.append('$').append(payload.toString());
sb.append(']');
sb.append(" with increment ").append(mt.posIncr);
- if (DEBUG)
- log.trace(sb.toString());
+
+ log.trace(sb.toString());
};
this.mtIndex++;
diff --git a/src/test/java/de/ids_mannheim/korap/analysis/TestMultiTerm.java b/src/test/java/de/ids_mannheim/korap/analysis/TestMultiTerm.java
index 9e33133..c6929ec 100644
--- a/src/test/java/de/ids_mannheim/korap/analysis/TestMultiTerm.java
+++ b/src/test/java/de/ids_mannheim/korap/analysis/TestMultiTerm.java
@@ -68,10 +68,12 @@
@Test
public void multiTermStringPayloadType2 () {
MultiTerm mt = new MultiTerm();
- mt.term("beispiel");
- mt.start(40);
- mt.end(50);
- mt.payload((int) 4000);
+ mt.setTerm("beispiel");
+ mt.setStart(40);
+ assertEquals(mt.getStart(), mt.start);
+ mt.setEnd(50);
+ assertEquals(mt.getEnd(), mt.end);
+ mt.setPayload((int) 4000);
assertEquals("beispiel#40-50$<?>[0,0,f,a0]", mt.toString());
};
@@ -99,4 +101,20 @@
mt = new MultiTerm("example$<l>4000<b>120");
assertEquals("example$<?>[0,0,0,0,0,0,f,a0,78]", mt.toString());
};
+
+ @Test
+ public void multiTermStringFail () {
+ MultiTerm mt = new MultiTerm("example#56-66");
+ assertEquals(56, mt.getStart());
+ assertEquals(66,mt.getEnd());
+
+ mt = new MultiTerm("example#56-66$<i>a");
+ assertEquals(56, mt.getStart());
+ assertEquals(66, mt.getEnd());
+
+ mt = new MultiTerm("example#56$<i>a");
+ assertEquals(mt.getPayload(), null);
+ assertEquals(mt.getStart(), 0);
+ assertEquals(mt.getEnd(), 0);
+ };
};
diff --git a/src/test/java/de/ids_mannheim/korap/analysis/TestMultiTermToken.java b/src/test/java/de/ids_mannheim/korap/analysis/TestMultiTermToken.java
index ab071c2..72263ba 100644
--- a/src/test/java/de/ids_mannheim/korap/analysis/TestMultiTermToken.java
+++ b/src/test/java/de/ids_mannheim/korap/analysis/TestMultiTermToken.java
@@ -19,20 +19,20 @@
mtt.add("b:banane");
assertEquals("[t:test|a:abbruch|b:banane]", mtt.toString());
mtt.add("c:chaos#21-26");
- assertEquals("[t:test|a:abbruch|b:banane|c:chaos]", mtt.toString());
- mtt.add("d:dadaismus#21-26$vergleich");
- assertEquals("[t:test|a:abbruch|b:banane|c:chaos|d:dadaismus$vergleich]", mtt.toString());
+ assertEquals("[(21-26)t:test|a:abbruch|b:banane|c:chaos#21-26]", mtt.toString());
+ mtt.add("d:dadaismus#21-28$vergleich");
+ assertEquals("[(21-28)t:test|a:abbruch|b:banane|c:chaos#21-26|d:dadaismus#21-28$vergleich]", mtt.toString());
};
@Test
public void multiTermTokenOffsets () {
MultiTermToken mtt = new MultiTermToken("t:test#23-27");
- assertEquals("[(23-27)t:test]", mtt.toString());
+ assertEquals("[(23-27)t:test#23-27]", mtt.toString());
mtt.add("b:baum#34-45");
- assertEquals("[(23-27)t:test|b:baum]", mtt.toString());
+ assertEquals("[(23-45)t:test#23-27|b:baum#34-45]", mtt.toString());
mtt.add("c:cannonball#34-45$tatsache");
- assertEquals("[(23-27)t:test|b:baum|c:cannonball$tatsache]", mtt.toString());
+ assertEquals("[(23-45)t:test#23-27|b:baum#34-45|c:cannonball#34-45$tatsache]", mtt.toString());
assertEquals(23, mtt.start);
- assertEquals(27, mtt.end);
+ assertEquals(45, mtt.end);
};
};
diff --git a/src/test/java/de/ids_mannheim/korap/analysis/TestMultiTermTokenStream.java b/src/test/java/de/ids_mannheim/korap/analysis/TestMultiTermTokenStream.java
index b80ded6..21b8cca 100644
--- a/src/test/java/de/ids_mannheim/korap/analysis/TestMultiTermTokenStream.java
+++ b/src/test/java/de/ids_mannheim/korap/analysis/TestMultiTermTokenStream.java
Binary files differ
diff --git a/src/test/java/de/ids_mannheim/korap/benchmark/TestBenchmarkElementSpans.java b/src/test/java/de/ids_mannheim/korap/benchmark/TestBenchmarkElementSpans.java
index cdaf9fe..7bfed40 100644
--- a/src/test/java/de/ids_mannheim/korap/benchmark/TestBenchmarkElementSpans.java
+++ b/src/test/java/de/ids_mannheim/korap/benchmark/TestBenchmarkElementSpans.java
@@ -150,6 +150,12 @@
// 10 times / 350 docs:
// 36.26158006 seconds
// 32.52575097 seconds
+ // 31.818091536 seconds
+ // 32.055321123 seconds
+ // 32.32125959 seconds
+ // 31.726277979 seconds
+ // 31.65826188 seconds
+ // 31.287057537 seconds
};
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java b/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
index 65d9245..51560d6 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
@@ -130,6 +130,19 @@
"... [{f/m:acht:b}{f/m:neun:a}] ...",
km.getSnippetBrackets());
+
+ /*
+ km = ki.getMatchInfo("match-c1!d1-p7-9(0)8-8(2)7-8",
+ "tokens",
+ "f",
+ null,
+ false,
+ false);
+
+ System.err.println(km.toJSON());
+ */
+
+
km = ki.getMatchInfo("match-c1!d1-p7-9(0)8-8(2)7-8",
"tokens",
"f",