[cleanup] Making the MultiTerm-Family more robust for corrupted input data

commit: d0d6feb6cd96c233fc69cae13b3a3dfbf5993414 [log] [tgz]
author: Nils Diewald <nils@diewald-online.de> Wed Feb 26 18:51:08 2014 +0000
committer: Nils Diewald <nils@diewald-online.de> Wed Feb 26 18:51:08 2014 +0000
tree: e17d45a7e29c188589ee46f90d89de785f82f142
parent: b76d498b89bc01508669d5ce107ce15729a863c9 [diff]
diff --git a/CHANGES b/CHANGES
index 0648e5f..9aa88b1 100644
--- a/CHANGES
+++ b/CHANGES

@@ -1,3 +1,6 @@
+0.30.4 2014-02-26
+        - [cleanup] Making MultiTerm* more robust.
+
 0.30.3 2014-02-20
         - Return json request in response if given (diewald)
 	- [bugfix] ClassSpans and WithinSpans check skipTo

diff --git a/pom.xml b/pom.xml
index e3d15d5..8203bdf 100644
--- a/pom.xml
+++ b/pom.xml

@@ -11,7 +11,7 @@
 -->
   <groupId>KorAP-modules</groupId>
   <artifactId>KorAP-lucene-index</artifactId>
-  <version>0.30.3</version>
+  <version>0.30.4</version>
   <packaging>jar</packaging>
 
   <name>KorAP-lucene-index</name>

diff --git a/src/main/java/de/ids_mannheim/korap/analysis/MultiTerm.java b/src/main/java/de/ids_mannheim/korap/analysis/MultiTerm.java
index 9c9e306..f9ff3d6 100644
--- a/src/main/java/de/ids_mannheim/korap/analysis/MultiTerm.java
+++ b/src/main/java/de/ids_mannheim/korap/analysis/MultiTerm.java

@@ -8,7 +8,7 @@
 
 /**
  * @author Nils Diewald
- * @version 0.2
+ * @version 0.3
  *
  * MultiTerm represents a term in a MultiTermToken.
  */
@@ -19,6 +19,12 @@
     public boolean storeOffsets = false;
     public BytesRef payload = null;
 
+    private static ByteBuffer bb = ByteBuffer.allocate(8);
+    private static String[] stringOffset;
+
+    private static short i, l;
+
+
     /**
      * The constructor.
      *
@@ -34,16 +40,14 @@
 	      MultiTerm test = new MultiTerm("test#0-4");
 	      MultiTerm test = new MultiTerm("test#0-4$Example");
 	      MultiTerm test = new MultiTerm("test#0-4$&lt;i&gt;1278");
+
+	      Strings that are malformed fail silently.
      */
     public MultiTerm (String term) {
-	/*
-	this.start = this.end = 0;
-	this.storeOffsets = false;
-	this.payload = null;
-	*/
 	_fromString(term);
     };
 
+    
     /**
      * The constructor with a separated prefix.
      * new MultiTerm('a', "bcd") is equivalent to
@@ -56,90 +60,177 @@
      */
     public MultiTerm (char prefix, String term) {
 	StringBuilder sb = new StringBuilder();
-	/*
-	this.start = this.end = 0;
-	this.storeOffsets = false;
-	this.payload = null;
-	*/
-	sb.append(prefix).append(':').append(term);
-	_fromString(sb.toString());
+	_fromString(sb.append(prefix).append(':').append(term).toString());
     };
-
-    public void term (String term) {
-	this.term = term;
-    };
-
-    public String term () {
-	return this.term;
-    };
-
+    
     /**
-     * The constructor.
+     * The empty constructor.
      */
     public MultiTerm () {
 	this.term = "";
-	/*
-	this.start = this.end = 0;
-	this.storeOffsets = false;
-	this.payload = null;
-	*/
     };
 
-    public void payload (Byte pl) {
+    
+    /**
+     * Sets the term value.
+     *
+     * @param term The term as a string
+     */
+    public void setTerm (String term) {
+	this.term = term;
+    };
+
+
+    /**
+     * Returns the term value.
+     *
+     * @return The term value.
+     */
+    public String getTerm () {
+	return this.term;
+    };
+
+    
+    /**
+     * Set the payload as a byte value.
+     *
+     * @param pl The payload.
+     */
+    public void setPayload (Byte pl) {
 	this.payload = new BytesRef( ByteBuffer.allocate(1).put(pl).array());
     };
 
-    public void payload (short pl) {
+    
+    /**
+     * Set the payload as a short value.
+     *
+     * @param pl The payload.
+     */
+    public void setPayload (short pl) {
 	this.payload = new BytesRef( ByteBuffer.allocate(2).putShort(pl).array());
     };
 
-    public void payload (int pl) {
+
+    /**
+     * Set the payload as an integer value.
+     *
+     * @param pl The payload.
+     */
+    public void setPayload (int pl) {
 	this.payload = new BytesRef( ByteBuffer.allocate(4).putInt(pl).array());
     };
 
-    public void payload (long pl) {
+    
+    /**
+     * Set the payload as a long value.
+     *
+     * @param pl The payload.
+     */
+    public void setPayload (long pl) {
 	this.payload = new BytesRef( ByteBuffer.allocate(8).putLong(pl).array());
     };
 
-    public void payload (String pl) {
+
+    /**
+     * Set the payload as a string value.
+     *
+     * @param pl The payload.
+     */
+    public void setPayload (String pl) {
 	this.payload = new BytesRef(pl);
     };
 
-    public void payload (byte[] pl) {
+
+    /**
+     * Set the payload as a byte array.
+     *
+     * @param pl The payload.
+     */
+    public void setPayload (byte[] pl) {
 	this.payload = new BytesRef(pl);
     };
 
-    public void payload (BytesRef pl) {
+
+    /**
+     * Set the payload as a BytesRef.
+     *
+     * @param pl The payload.
+     */
+    public void setPayload (BytesRef pl) {
 	this.payload = pl;
     };
 
-    public BytesRef payload () {
+    /**
+     * Get the payload.
+     *
+     * @return The payload as a BytesRef.
+     */
+    public BytesRef getPayload () {
 	return this.payload;
     };
 
-    public void start (int value) {
+
+    /**
+     * Set the start position of the term.
+     *
+     * @param value The start position.
+     */
+    public void setStart (int value) {
 	this.start = value;
     };
 
-    public int start () {
+
+    /**
+     * Get the start position.
+     *
+     * @return The start position.
+     */
+    public int getStart () {
 	return this.start;
     };
 
-    public void end (int value) {
+
+    /**
+     * Set the end position of the term.
+     *
+     * @param value The end position.
+     */
+    public void setEnd (int value) {
 	this.end = value;
     };
 
-    public int end () {
+
+    /**
+     * Get the end position.
+     *
+     * @return The end position.
+     */
+    public int getEnd () {
 	return this.end;
     };
 
-    public boolean storeOffsets () {
+
+    /**
+     * Set the flag for stored offsets.
+     *
+     * @param value Boolean value indicating that the term
+     *        contains stored offsets.
+     */
+    public void hasStoredOffsets (boolean value) {
+	this.storeOffsets = value;
+    };
+
+
+    /**
+     * Check if there are offsets stored.
+     *
+     * @return Boolean value indicating that the term
+     *         contains stored offsets.
+     */
+    public boolean hasStoredOffsets () {
 	return this.storeOffsets;
     };
 
-    public void storeOffsets (boolean value) {
-	this.storeOffsets = value;
-    };
 
     private void _fromString (String term) {
 	String[] termSurface = term.split("\\$", 2);
@@ -150,71 +241,52 @@
 
 	    // Payload has a type
 	    if (payloadStr.charAt(0) == '<' && payloadStr.charAt(2) == '>') {
-		ByteBuffer bb = ByteBuffer.allocate(8);
 
+		// Rewind bytebuffer
+		bb.rewind();
+
+		// Split payload at type marker boundaries
 		String[] pls = payloadStr.split("(?=<)|(?<=>)");
-		int l = 0;
 
-		for (int i = 1; i < pls.length;) {
+		l = 0; // Bytearray length
 
-		    // Resize the buffer
-		    if ((bb.capacity() - l) < 8) {
-			bb = ByteBuffer.allocate(bb.capacity() + 8).put(bb.array());
-			bb.position(l);
+		try {
+		    for (i = 1; i < pls.length;) {
+
+			// Resize the bytebuffer
+			if ((bb.capacity() - l) < 8) {
+			    bb = ByteBuffer.allocate(bb.capacity() + 8)
+				.put(bb.array());
+			    bb.position(l);
+			};
+
+			switch (pls[i]) {
+			case "<b>": // byte
+			    bb.put(Byte.parseByte(pls[i+1]));
+			    l++;
+			    break;
+			case "<s>": // short
+			    bb.putShort(Short.parseShort(pls[i+1]));
+			    l+=2;
+			    break;
+			case "<i>": // integer
+			    bb.putInt(Integer.parseInt(pls[i+1]));
+			    l+=4;
+			    break;
+			case "<l>": // long
+			    bb.putLong(Long.parseLong(pls[i+1]));
+			    l+=8;
+			    break;
+			};
+			i+=2;
 		    };
-		    switch (pls[i]) {
-		    case "<b>": // byte
-			bb.put(Byte.parseByte(pls[i+1]));
-			l++;
-			break;
-		    case "<s>":
-			bb.putShort(Short.parseShort(pls[i+1]));
-			l+=2;
-			break;
-		    case "<i>":
-			bb.putInt(Integer.parseInt(pls[i+1]));
-			l+=4;
-			break;
-		    case "<l>":
-			bb.putLong(Long.parseLong(pls[i+1]));
-			l+=8;
-			break;
-		    };
-		    i+=2;
+		
+		    byte[] bytes = new byte[l];
+		    System.arraycopy(bb.array(), 0, bytes, 0, l);
+		    this.payload = new BytesRef(bytes);
+		}
+		catch (Exception e) {
 		};
-		byte[] bytes = new byte[l];
-		System.arraycopy(bb.array(), 0, bytes, 0, l);
-		this.payload = new BytesRef(bytes);
-
-
-		/*
-		payloadStr = payloadStr.substring(3, payloadStr.length());
-		switch (type) {
-		case 'b':  // byte
-
-		    System.err.println("bbb");
-		    payloadBytes = ByteBuffer.allocate(1).put(new Byte(payloadStr)).array();
-		    break;
-		case 's':  // short
-		    payloadBytes = ByteBuffer.allocate(2).putShort(
-								   Short.parseShort(payloadStr)
-								   ).array();
-		    break;
-		case 'i': // integer
-		    payloadBytes = ByteBuffer.allocate(4).putInt(
-								 Integer.parseInt(payloadStr)
-								 ).array();
-		    break;
-		case 'l': // long
-		    payloadBytes = ByteBuffer.allocate(8).putLong(
-								  Long.parseLong(payloadStr)
-								  ).array();
-		    break;
-		};
-		TODO:
-		case '?': // arbitrary
-		    payloadStr = 
-		*/
 	    }
 
 	    // Payload is a string
@@ -222,18 +294,24 @@
 		this.payload = new BytesRef(payloadStr);
 	    };
 	};
-	String[] stringOffset = termSurface[0].split("\\#", 2);
-	if (stringOffset.length == 2) {
-	    String[] offset = stringOffset[1].split("\\-", 2);
+	
+	// Parse offset information
+	stringOffset = termSurface[0].split("\\#", 2);
 
+	if (stringOffset.length == 2) {
+
+	    // Split start and end position of the offset
+	    String[] offset = stringOffset[1].split("\\-", 2);
+   
+	    // Start and end is given
 	    if (offset.length == 2 && offset[0].length() > 0) {
-		this.start = Integer.parseInt(offset[0]);
-		this.end   = Integer.parseInt(offset[1]);
-	    /*
-	    }
-	    else {
-		this.storeOffsets(false);
-	    */
+		try {
+		    this.start = Integer.parseInt(offset[0]);
+		    this.end   = Integer.parseInt(offset[1]);
+
+		}
+		catch (NumberFormatException e) {
+		};
 	    };
 	};
 	this.term = stringOffset[0];
@@ -249,14 +327,14 @@
      * @see #toStringShort().
      */
     public String toString () {
+
 	StringBuilder sb = new StringBuilder(this.term);
+
 	if (this.start != this.end) {
-	    sb.append('#').append(this.start).append('-').append(this.end);
-	/*
-	}
-	else if (!this.storeOffsets()) {
-	    sb.append("#-");
-	*/
+	    sb.append('#')
+	      .append(this.start)
+	      .append('-')
+	      .append(this.end);
 	};
 
 	if (this.payload != null) {
@@ -265,7 +343,8 @@
 		sb.append(this.payload.utf8ToString());
 	    }
 	    catch (AssertionError e) {
-		sb.append("<?>").append(join(',', this.payload.toString().split(" ")));
+		sb.append("<?>")
+	          .append(this.payload.toString().replace(' ', ','));
 	    };
 	};
 
@@ -283,7 +362,14 @@
     public String toStringShort () {
 	StringBuilder sb = new StringBuilder(this.term);
 	if (this.payload != null) {
-	    sb.append('$').append(this.payload.utf8ToString());
+	    sb.append('$');
+	    try {
+		sb.append(this.payload.utf8ToString());
+	    }
+	    catch (AssertionError e) {
+		sb.append("<?>")
+		.append(this.payload.toString().replace(' ', ','));
+	    };
 	};
 	return sb.toString();
     };

diff --git a/src/main/java/de/ids_mannheim/korap/analysis/MultiTermToken.java b/src/main/java/de/ids_mannheim/korap/analysis/MultiTermToken.java
index ff70996..9af4156 100644
--- a/src/main/java/de/ids_mannheim/korap/analysis/MultiTermToken.java
+++ b/src/main/java/de/ids_mannheim/korap/analysis/MultiTermToken.java

@@ -3,12 +3,6 @@
 import de.ids_mannheim.korap.analysis.MultiTerm;
 import java.util.*;
 
-/*
-  Todo:
-  - Always write offsets to payloads!
-  - Offsets can be overwritten!
-  - Check that terms are not ""!!!
-*/
 
 /**
  * @author Nils Diewald
@@ -19,8 +13,15 @@
     public int start, end = 0;
     public List<MultiTerm> terms;
 
+    private static short i = 0;
+
+    /**
+     * The constructor.
+     *
+     * @param term The first MultiTerm object of the token (at least one is required).
+     */
     public MultiTermToken (MultiTerm term, MultiTerm ... moreTerms) {
-	this.terms = new ArrayList<MultiTerm>();
+	this.terms = new ArrayList<MultiTerm>(16);
 
 	if (term.start != term.end) {
 	    this.start = term.start;
@@ -31,91 +32,143 @@
 	terms.add( term );
 
 	// Further elements on same position
-	for (int i = 0; i < moreTerms.length; i++) {
+	for (i = 0; i < moreTerms.length; i++) {
 	    term = moreTerms[i];
 	    term.posIncr = 0;
 	    terms.add(term);
 	};
     };
 
+
+    /**
+     * The constructor.
+     *
+     * @param prefix A term prefix.
+     * @param surface A surface string.
+     */
     public MultiTermToken (char prefix, String surface) {
-	this.terms = new ArrayList<MultiTerm>();
+	this.terms = new ArrayList<MultiTerm>(16);
 
 	MultiTerm term = new MultiTerm(prefix, surface);
 
-	if (term.start != term.end) {
-	    this.start = term.start;
-	    this.end = term.end;
-	};
+	this.setOffset(term.start, term.end);
 
 	// First word element
 	term.posIncr = 1;
 	terms.add( term );
     };
+    
 
-
+    /**
+     * The constructor.
+     *
+     * @param surface At least one term surface string.
+     */
     public MultiTermToken (String surface, String ... moreTerms) {
-	this.terms = new ArrayList<MultiTerm>();
+	this.terms = new ArrayList<MultiTerm>(16);
 
 	MultiTerm term = new MultiTerm(surface);
 
-	if (term.start != term.end) {
-	    this.start = term.start;
-	    this.end = term.end;
-	};
+	this.setOffset(term.start, term.end);
 
 	// First word element
 	term.posIncr = 1;
 	terms.add( term );
 
-
 	// Further elements on same position
-	for (int i = 0; i < moreTerms.length; i++) {
-
+	for (i = 0; i < moreTerms.length; i++) {
 	    term = new MultiTerm( moreTerms[i] );
+	    this.setOffset(term.start, term.end);
 	    term.posIncr = 0;
 	    terms.add(term);
 	};
     };
 
+    
+    /**
+     * Add a new term to the MultiTermToken.
+     *
+     * @param mt A MultiTerm.
+     */
     public void add (MultiTerm mt) {
+	mt.posIncr = 0;
+	this.setOffset(mt.start, mt.end);
 	terms.add(mt);
     };
 
+
+    /**
+     * Add a new term to the MultiTermToken.
+     *
+     * @param term A surface string.
+     */
     public void add (String term) {
+	if (term.length() == 0)
+	    return;
 	MultiTerm mt = new MultiTerm(term);
+	this.setOffset(mt.start, mt.end);
 	mt.posIncr = 0;
 	terms.add(mt);
     };
 
+    /**
+     * Add a new term to the MultiTermToken.
+     *
+     * @param prefix A prefix character for the surface string.
+     * @param term A surface string.
+     */
     public void add (char prefix, String term) {
+	if (term.length() == 0)
+	    return;
 	MultiTerm mt = new MultiTerm(prefix, term);
+	this.setOffset(mt.start, mt.end);
 	mt.posIncr = 0;
 	terms.add(mt);
     };
 
-    public void offset (int start, int end) {
-	this.start = start;
-	this.end   = end;
+
+    /**
+     * Sets the offset information of the MultiTermToken.
+     *
+     * @param start The character position of the token start.
+     * @param end The character position of the token end.
+     */
+    public void setOffset (int start, int end) {
+	if (start != end) {
+	    this.start = (this.start == 0 || start < this.start) ? start : this.start;
+	    this.end   = end > this.end ? end : this.end;
+	};
     };
 
+    /**
+     * Serialize the MultiTermToken to a string.
+     *
+     * @return A string representation of the token, with leading offset information.
+     */
     public String toString () {
 	StringBuffer sb = new StringBuffer();
 
 	sb.append('[');
 	if (this.start != this.end) {
-	    sb.append('(').append(this.start).append('-').append(this.end).append(')');
+	    sb.append('(')
+	      .append(this.start)
+	      .append('-')
+	      .append(this.end)
+	      .append(')');
 	};
 
-	int i = 0;
+	i = 0;
 	for (; i < this.terms.size() - 1; i++) {
-	    sb.append(this.terms.get(i).toStringShort()).append('|');
+	    sb.append(this.terms.get(i).toString()).append('|');
 	};
-	sb.append(this.terms.get(i).toStringShort()).append(']');
+	sb.append(this.terms.get(i).toString()).append(']');
 
 	return sb.toString();
     };
 
+    /**
+     * Return the number of MultiTerms in the MultiTermToken.
+     */
     public int size () {
 	return this.terms.size();
     };

diff --git a/src/main/java/de/ids_mannheim/korap/analysis/MultiTermTokenStream.java b/src/main/java/de/ids_mannheim/korap/analysis/MultiTermTokenStream.java
index 66462d2..b3d2a6f 100644
--- a/src/main/java/de/ids_mannheim/korap/analysis/MultiTermTokenStream.java
+++ b/src/main/java/de/ids_mannheim/korap/analysis/MultiTermTokenStream.java

@@ -10,7 +10,6 @@
 
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-// import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 
@@ -21,13 +20,13 @@
 
 /*
   Todo:
-  - Do not use offsetAttr!
-#  - Payload is [4ByteStartOffset][14BitEndOffset-startOffset][1BitBooleanIfSpan][1BitBooleanIfOpen]
-  - Payload is [4ByteOffsetStart][4ByteOffsetStart]
+   - !Payload is [4ByteStartOffset][14BitEndOffset-startOffset][1BitBooleanIfSpan][1BitBooleanIfOpen]
+   - Payload is [4ByteOffsetStart][4ByteOffsetEnd]
 */
 
 /**
  * @author Nils Diewald
+ * @version 0.3
  *
  * MultiTermTokenStream extends Lucenes TokenStream class to work with MultiTermTokens.
  *
@@ -35,181 +34,240 @@
  */
 public class MultiTermTokenStream extends TokenStream {
     private CharTermAttribute charTermAttr;
-    //     private OffsetAttribute offsetAttr;
     private PositionIncrementAttribute posIncrAttr;
     private PayloadAttribute payloadAttr;
 
-    private static Pattern pattern = Pattern.compile("\\[(\\(([0-9]+)-([0-9]+)\\))?([^\\]]+?)\\]");
-
-    private List<MultiTermToken> multiTermTokens;
-    private int mttIndex = 0;
-    private int mtIndex  = 0;
-    //    private TokenTextGenerator ttGen = new TokenTextGenerator();
-
-    private final Logger log = LoggerFactory.getLogger(MultiTermTokenStream.class);
+    private static final Pattern pattern = Pattern.compile("\\[(?:\\([0-9]+-[0-9]+\\))?([^\\]]+?)\\]");
 
     // This advices the java compiler to ignore all loggings
     public static final boolean DEBUG = false;
+    private final Logger log = LoggerFactory.getLogger(MultiTermTokenStream.class);
 
+    private List<MultiTermToken> multiTermTokens;
+    private int mttIndex = 0, mtIndex  = 0;
+    private static short i = 0;
+
+
+    /**
+     * The empty Constructor.
+     */
     public MultiTermTokenStream () {
-	//	this.offsetAttr   = this.addAttribute(OffsetAttribute.class);
-        this.charTermAttr = this.addAttribute(CharTermAttribute.class);
-        this.posIncrAttr  = this.addAttribute(PositionIncrementAttribute.class);
-	this.payloadAttr = this.addAttribute(PayloadAttribute.class);
-	this.multiTermTokens  = new ArrayList<MultiTermToken>();
-
-	/*
-        if (!indexTokens.isEmpty()){
-            indexTokens.get(indexTokens.size() - 1).setIncrement(false);
-        };
-	*/
+        this.charTermAttr    = this.addAttribute(CharTermAttribute.class);
+        this.posIncrAttr     = this.addAttribute(PositionIncrementAttribute.class);
+	this.payloadAttr     = this.addAttribute(PayloadAttribute.class);
+	this.multiTermTokens = new ArrayList<MultiTermToken>(100);
     };
 
+
+    /**
+     * The Constructor.
+     *
+     * @param stream The MultiTermTokenStream as a string representation.
+     */
     public MultiTermTokenStream (String stream) {
 	this();
 
-	int pos = 0;
-
 	Matcher matcher = pattern.matcher(stream);
 
 	while (matcher.find()) {
 
-	    String[] seg = matcher.group(4).split("\\|");
+	    String[] seg = matcher.group(1).split("\\|");
 	    MultiTermToken mtt = new MultiTermToken( seg[0] );
 
-	    if (matcher.group(2) != null)
-		mtt.start = Integer.parseInt(matcher.group(2));
-
-	    if (matcher.group(3) != null)
-		mtt.end = Integer.parseInt(matcher.group(3));
-
-	    for (int i = 1; i < seg.length; i++)
+	    for (i = 1; i < seg.length; i++)
 		mtt.add(seg[i]);
 
 	    this.addMultiTermToken(mtt);
 	};
     };
 
+
+    /**
+     * Add a MultiTermToken to the end of the MultiTermTokenStream.
+     *
+     * @param mtt A MultiTermToken.
+     */
     public void addMultiTermToken (MultiTermToken mtt) {
 	this.multiTermTokens.add(mtt);
     };
 
+
+    /**
+     * Add a MultiTermToken by means of MultiTerms to the end of
+     * the MultiTermTokenStream.
+     *
+     * @param term At least one MultiTerm.
+     */
     public void addMultiTermToken (MultiTerm term, MultiTerm ... moreTerms) {
 	this.addMultiTermToken(new MultiTermToken(term, moreTerms));
     };
 
+
+    /**
+     * Add a MultiTermToken by means of a single MultiTerm to the end of
+     * the MultiTermTokenStream.
+     *
+     * @param prefix A prefix character of a surface form of a MultiTerm.
+     * @param surface A surface string of a MultiTerm.
+     */
     public void addMultiTermToken (char prefix, String surface) {
 	this.addMultiTermToken(new MultiTermToken(prefix, surface));
     };
 
+
+    /**
+     * Add a MultiTermToken by means of a series of surface strings
+     * to the end of the MultiTermTokenStream.
+     *
+     * @param surface At least one surface string of a MultiTerm.
+     */
     public void addMultiTermToken (String surface, String ... moreTerms) {
 	this.addMultiTermToken(new MultiTermToken(surface, moreTerms));
     };
 
+
+    /**
+     * Add meta information to the MultiTermTokenStream.
+     *
+     * @param key A string for denoting the meta information.
+     * @param value The value of the meta key as a string.
+     */
     public void addMeta (String key, String value) {
 	MultiTerm mt = new MultiTerm('-', key);
-	//	mt.storeOffsets(false);
-	mt.payload(value);
+	mt.setPayload(value);
 	this.multiTermTokens.get(0).add(mt);
     };
 
+
+    /**
+     * Add meta information to the MultiTermTokenStream.
+     *
+     * @param key A string for denoting the meta information.
+     * @param value The value of the meta key as a byte array.
+     */
     public void addMeta (String key, byte[] value) {
 	MultiTerm mt = new MultiTerm('-', key);
-	//	mt.storeOffsets(false);
-	mt.payload(value);
+	mt.setPayload(value);
 	this.multiTermTokens.get(0).add(mt);
     };
 
 
+    /**
+     * Add meta information to the MultiTermTokenStream.
+     *
+     * @param key A string for denoting the meta information.
+     * @param value The value of the meta key as a short value.
+     */
     public void addMeta (String key, short value) {
 	MultiTerm mt = new MultiTerm('-', key);
-	//	mt.storeOffsets(false);
-	mt.payload(value);
+	mt.setPayload(value);
 	this.multiTermTokens.get(0).add(mt);
     };
 
+
+    /**
+     * Add meta information to the MultiTermTokenStream.
+     *
+     * @param key A string for denoting the meta information.
+     * @param value The value of the meta key as a long value.
+     */
     public void addMeta (String key, long value) {
 	MultiTerm mt = new MultiTerm('-', key);
-	//	mt.storeOffsets(false);
-	mt.payload(value);
+	mt.setPayload(value);
 	this.multiTermTokens.get(0).add(mt);
     };
 
+
+    /**
+     * Add meta information to the MultiTermTokenStream.
+     *
+     * @param key A string for denoting the meta information.
+     * @param value The value of the meta key as an integer value.
+     */
     public void addMeta (String key, int value) {
 	MultiTerm mt = new MultiTerm('-', key);
-	//	mt.storeOffsets(false);
-	mt.payload(value);
+	mt.setPayload(value);
 	this.multiTermTokens.get(0).add(mt);
     };
 
+
+    /**
+     * Increment the token in the MultiTermTokenStream.
+     * This overrides the function in Lucene's TokenStream.
+     */
     @Override
     public final boolean incrementToken() throws IOException {
 	this.payloadAttr.setPayload(null);
 
+	// Last token reached
 	if (this.multiTermTokens.size() == this.mttIndex) {
 	    reset();
 	    return false;
 	};
 
+	// Get current token
 	MultiTermToken mtt = this.multiTermTokens.get( this.mttIndex );
 
+	// Last term reached
 	if (mtt.terms.size() == this.mtIndex) {
 	    this.mtIndex = 0;
 	    this.mttIndex++;
+
+	    // Last term of last token reached
 	    if (this.multiTermTokens.size() == this.mttIndex) {
 		reset();
 		return false;
 	    }
+
+	    // Get next token
 	    else {
 		mtt = this.multiTermTokens.get( this.mttIndex );
 	    };
 	};
 
+	// Get current term
 	MultiTerm mt = mtt.terms.get(this.mtIndex);
 
-	// Get the current index token
-
 	// Set the relative position to the former term
         posIncrAttr.setPositionIncrement( mt.posIncr );
         charTermAttr.setEmpty();
 	charTermAttr.append( mt.term );
 
 	BytesRef payload = new BytesRef();
+
+	// There is offset information
 	if (mt.start != mt.end) {
 	    if (DEBUG)
 		log.trace("MultiTerm with payload offset: {}-{}", mt.start, mt.end);
+
+	    // Add offsets to BytesRef payload
 	    payload.append(new BytesRef(int2byte(mt.start)));
 	    payload.append(new BytesRef(int2byte(mt.end)));
-	    /*
-	      }
-	      else if (mtt.start != mtt.end) {
-	      payload.append(new BytesRef(int2byte(mtt.start)));
-	      payload.append(new BytesRef(int2byte(mtt.end)));
-	    */
 	};
 
-	// Payload
+	// There is payload in the MultiTerm
 	if (mt.payload != null) {
-	    payload.append(mt.payload());
+	    payload.append(mt.payload);
 	    if (DEBUG)
 		log.trace("Create payload[1] {}", payload.toString());
 	};
 
+	// There is payload in the current token to index
 	if (payload.length > 0) {
+	    payloadAttr.setPayload(payload);
 	    if (DEBUG)
 		log.trace("Set payload[2] {}", payload.toString());
-	    payloadAttr.setPayload(payload);
 	};
 
-	if (log.isTraceEnabled()) {
+	if (DEBUG) {
 	    StringBuilder sb = new StringBuilder("Index: [");
 	    sb.append(mt.term);
 	    if (payload.length > 0)
 		sb.append('$').append(payload.toString());
 	    sb.append(']');
 	    sb.append(" with increment ").append(mt.posIncr);
-	    if (DEBUG)
-		log.trace(sb.toString());
+
+	    log.trace(sb.toString());
 	};
 
 	this.mtIndex++;

diff --git a/src/test/java/de/ids_mannheim/korap/analysis/TestMultiTerm.java b/src/test/java/de/ids_mannheim/korap/analysis/TestMultiTerm.java
index 9e33133..c6929ec 100644
--- a/src/test/java/de/ids_mannheim/korap/analysis/TestMultiTerm.java
+++ b/src/test/java/de/ids_mannheim/korap/analysis/TestMultiTerm.java

@@ -68,10 +68,12 @@
     @Test
     public void multiTermStringPayloadType2 () {
 	MultiTerm mt = new MultiTerm();
-	mt.term("beispiel");
-	mt.start(40);
-	mt.end(50);
-	mt.payload((int) 4000);
+	mt.setTerm("beispiel");
+	mt.setStart(40);
+	assertEquals(mt.getStart(), mt.start);
+	mt.setEnd(50);
+	assertEquals(mt.getEnd(), mt.end);
+	mt.setPayload((int) 4000);
 	assertEquals("beispiel#40-50$<?>[0,0,f,a0]", mt.toString());
     };
 
@@ -99,4 +101,20 @@
 	mt = new MultiTerm("example$<l>4000<b>120");
 	assertEquals("example$<?>[0,0,0,0,0,0,f,a0,78]", mt.toString());
     };
+
+    @Test
+    public void multiTermStringFail () {
+	MultiTerm mt = new MultiTerm("example#56-66");
+	assertEquals(56, mt.getStart());
+	assertEquals(66,mt.getEnd());
+
+	mt = new MultiTerm("example#56-66$<i>a");
+	assertEquals(56, mt.getStart());
+	assertEquals(66, mt.getEnd());
+
+	mt = new MultiTerm("example#56$<i>a");
+	assertEquals(mt.getPayload(), null);
+	assertEquals(mt.getStart(), 0);
+	assertEquals(mt.getEnd(), 0);
+    };
 };

diff --git a/src/test/java/de/ids_mannheim/korap/analysis/TestMultiTermToken.java b/src/test/java/de/ids_mannheim/korap/analysis/TestMultiTermToken.java
index ab071c2..72263ba 100644
--- a/src/test/java/de/ids_mannheim/korap/analysis/TestMultiTermToken.java
+++ b/src/test/java/de/ids_mannheim/korap/analysis/TestMultiTermToken.java

@@ -19,20 +19,20 @@
 	mtt.add("b:banane");
 	assertEquals("[t:test|a:abbruch|b:banane]", mtt.toString());
 	mtt.add("c:chaos#21-26");
-	assertEquals("[t:test|a:abbruch|b:banane|c:chaos]", mtt.toString());
-	mtt.add("d:dadaismus#21-26$vergleich");
-	assertEquals("[t:test|a:abbruch|b:banane|c:chaos|d:dadaismus$vergleich]", mtt.toString());
+	assertEquals("[(21-26)t:test|a:abbruch|b:banane|c:chaos#21-26]", mtt.toString());
+	mtt.add("d:dadaismus#21-28$vergleich");
+	assertEquals("[(21-28)t:test|a:abbruch|b:banane|c:chaos#21-26|d:dadaismus#21-28$vergleich]", mtt.toString());
     };
 
     @Test
     public void multiTermTokenOffsets () {
 	MultiTermToken mtt = new MultiTermToken("t:test#23-27");
-	assertEquals("[(23-27)t:test]", mtt.toString());
+	assertEquals("[(23-27)t:test#23-27]", mtt.toString());
 	mtt.add("b:baum#34-45");
-	assertEquals("[(23-27)t:test|b:baum]", mtt.toString());
+	assertEquals("[(23-45)t:test#23-27|b:baum#34-45]", mtt.toString());
 	mtt.add("c:cannonball#34-45$tatsache");
-	assertEquals("[(23-27)t:test|b:baum|c:cannonball$tatsache]", mtt.toString());
+	assertEquals("[(23-45)t:test#23-27|b:baum#34-45|c:cannonball#34-45$tatsache]", mtt.toString());
 	assertEquals(23, mtt.start);
-	assertEquals(27, mtt.end);
+	assertEquals(45, mtt.end);
     };
 };

diff --git a/src/test/java/de/ids_mannheim/korap/analysis/TestMultiTermTokenStream.java b/src/test/java/de/ids_mannheim/korap/analysis/TestMultiTermTokenStream.java
index b80ded6..21b8cca 100644
--- a/src/test/java/de/ids_mannheim/korap/analysis/TestMultiTermTokenStream.java
+++ b/src/test/java/de/ids_mannheim/korap/analysis/TestMultiTermTokenStream.java
Binary files differ

diff --git a/src/test/java/de/ids_mannheim/korap/benchmark/TestBenchmarkElementSpans.java b/src/test/java/de/ids_mannheim/korap/benchmark/TestBenchmarkElementSpans.java
index cdaf9fe..7bfed40 100644
--- a/src/test/java/de/ids_mannheim/korap/benchmark/TestBenchmarkElementSpans.java
+++ b/src/test/java/de/ids_mannheim/korap/benchmark/TestBenchmarkElementSpans.java

@@ -150,6 +150,12 @@
 	// 10 times / 350 docs:
 	// 36.26158006 seconds
 	// 32.52575097 seconds
+	// 31.818091536 seconds
+	// 32.055321123 seconds
+	// 32.32125959 seconds
+	// 31.726277979 seconds
+	// 31.65826188 seconds
+	// 31.287057537 seconds
     };
 
 

diff --git a/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java b/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
index 65d9245..51560d6 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java

@@ -130,6 +130,19 @@
 		     "... [{f/m:acht:b}{f/m:neun:a}] ...",
 		     km.getSnippetBrackets());
 
+
+	/*
+	km = ki.getMatchInfo("match-c1!d1-p7-9(0)8-8(2)7-8",
+			     "tokens",
+			     "f",
+			     null,
+			     false,
+			     false);
+
+	System.err.println(km.toJSON());
+	*/
+
+	
 	km = ki.getMatchInfo("match-c1!d1-p7-9(0)8-8(2)7-8",
 			     "tokens",
 			     "f",
commit	d0d6feb6cd96c233fc69cae13b3a3dfbf5993414	[log] [tgz]
author	Nils Diewald <nils@diewald-online.de>	Wed Feb 26 18:51:08 2014 +0000
committer	Nils Diewald <nils@diewald-online.de>	Wed Feb 26 18:51:08 2014 +0000
tree	e17d45a7e29c188589ee46f90d89de785f82f142
parent	b76d498b89bc01508669d5ce107ce15729a863c9 [diff]