Documentation improvements and various dependency updates
diff --git a/Changes b/Changes
index 1b2dba8..c6d33fa 100644
--- a/Changes
+++ b/Changes
@@ -1,4 +1,10 @@
-0.49.3 2014-02-03
+0.49.4 2015-02-04
+ - [documentation] Improved documentation for API classes (diewald)
+ - [performance] Updated Lucene dependency from 4.5.1 to 4.10.3,
+ Updated Jackson dependency from 2.4.0 to 2.4.4,
+ Updated Jersey dependency from 2.4.1 to 2.15 (diewald)
+
+0.49.3 2015-02-03
- [documentation] Improved documentation for API classes (diewald)
- [documentation] Improved documentation for various queries (margaretha)
- [feature] Added deserialization of SpanSubSpanQueries (margaretha,diewald)
diff --git a/pom.xml b/pom.xml
index bbcfb33..cdbbbd6 100644
--- a/pom.xml
+++ b/pom.xml
@@ -23,7 +23,7 @@
<groupId>KorAP-modules</groupId>
<artifactId>KorAP-lucene-index</artifactId>
- <version>0.49.3</version>
+ <version>0.49.4</version>
<packaging>jar</packaging>
<name>KorAP-lucene-index</name>
@@ -47,7 +47,7 @@
</developers>
<properties>
- <jersey.version>2.4.1</jersey.version>
+ <jersey.version>2.15</jersey.version>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
@@ -103,7 +103,7 @@
<artifactId>lucene-core</artifactId>
<groupId>org.apache.lucene</groupId>
<type>jar</type>
- <version>4.5.1</version>
+ <version>4.10.3</version>
</dependency>
<!-- Lucene queryparser dependency -->
@@ -111,7 +111,7 @@
<artifactId>lucene-queryparser</artifactId>
<groupId>org.apache.lucene</groupId>
<type>jar</type>
- <version>4.5.1</version>
+ <version>4.10.3</version>
</dependency>
<!-- Lucene analyzers dependency -->
@@ -119,7 +119,7 @@
<artifactId>lucene-analyzers-common</artifactId>
<groupId>org.apache.lucene</groupId>
<type>jar</type>
- <version>4.3.1</version>
+ <version>4.10.3</version>
</dependency>
<dependency>
@@ -159,17 +159,17 @@
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
- <version>2.4.0</version>
+ <version>2.4.4</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-annotations</artifactId>
- <version>2.4.0</version>
+ <version>2.4.4</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
- <version>2.4.0</version>
+ <version>2.4.4</version>
</dependency>
<!--
Temporarily disable @Experimental annotation
diff --git a/src/main/java/de/ids_mannheim/korap/analysis/MultiTerm.java b/src/main/java/de/ids_mannheim/korap/analysis/MultiTerm.java
index f9ff3d6..1f72e8d 100644
--- a/src/main/java/de/ids_mannheim/korap/analysis/MultiTerm.java
+++ b/src/main/java/de/ids_mannheim/korap/analysis/MultiTerm.java
@@ -5,12 +5,35 @@
import java.nio.ByteBuffer;
import java.util.*;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
/**
- * @author Nils Diewald
- * @version 0.3
+ * A MultiTerm represents a single term (e.g. a word, an annotation, a relation)
+ * that can be part of a MultiTermToken.
*
- * MultiTerm represents a term in a MultiTermToken.
+ * A MultiTerm consists of a term representation string, optional character offset
+ * information that matches the term to the character stream of the input text,
+ * and an arbitrary payload.
+ *
+ * There is a simple string representation of MultiTerms supported:
+ * The string is the first sequence of characters.
+ * Offsets are written as an appended and dash separated pair of integers.
+ * Payloads are written following a dollar sign.
+ * Payload segments can be typed as being a short (s), an integer (i), or a long (l)
+ * value in leading angular brackets.
+ * All other (untyped) payloads are treated as being UTF-8 character sequences.
+ *
+ * <blockquote><pre>
+ * MultiTerm test1 = new MultiTerm("test");
+ * MultiTerm test2 = new MultiTerm("test#0-4");
+ * MultiTerm test3 = new MultiTerm("test#0-4$Example");
+ * MultiTerm test4 = new MultiTerm("test#0-4$<i>1278");
+ * </pre></blockquote>
+ *
+ * <strong>Warning</strong>: Strings that are malformed fail silently!
+ *
+ * @author diewald
*/
public class MultiTerm {
public int start, end = 0;
@@ -24,141 +47,71 @@
private static short i, l;
+ // This advises the Java compiler to ignore all logging
+ public static final boolean DEBUG = false;
+ private final Logger log = LoggerFactory.getLogger(MultiTermTokenStream.class);
+
/**
- * The constructor.
- *
- * @param term The term surface.
- Offsets can be written as an appended and dash separated pair of integers,
- payloads can be written following a dollar sign.
- payloads can be typed as being a short (s), an integer (i), or a long (l)
- in leading angular brackets. All other payloads are treated as being UTF-8
- characer sequences.
-
- Examples:
- MultiTerm test = new MultiTerm("test");
- MultiTerm test = new MultiTerm("test#0-4");
- MultiTerm test = new MultiTerm("test#0-4$Example");
- MultiTerm test = new MultiTerm("test#0-4$<i>1278");
-
- Strings that are malformed fail silently.
- */
- public MultiTerm (String term) {
- _fromString(term);
- };
-
-
- /**
- * The constructor with a separated prefix.
- * new MultiTerm('a', "bcd") is equivalent to
- * new MultiTerm("a:bcd");
- *
- * @param prefix A special prefix for the term.
- * @param term The term surface.
- *
- * @see #MultiTerm(String)
- */
- public MultiTerm (char prefix, String term) {
- StringBuilder sb = new StringBuilder();
- _fromString(sb.append(prefix).append(':').append(term).toString());
- };
-
- /**
- * The empty constructor.
+ * Construct a new MultiTerm object.
*/
public MultiTerm () {
- this.term = "";
+ this.term = "";
+ };
+
+
+ /**
+ * Construct a new MultiTerm object.
+ *
+ * @param term The term surface (see synopsis).
+ */
+ public MultiTerm (String term) {
+ _fromString(term);
};
/**
- * Sets the term value.
+ * Construct a new MultiTerm object.
*
- * @param term The term as a string
+ * In addition to the normal surface representation,
+ * this supports a prefix notation.
+ * The following expressions are equal:
+ *
+ * <blockquote><pre>
+ * MultiTerm test1 = new MultiTerm('a', "bcd");
+ * MultiTerm test2 = new MultiTerm("a:bcd");
+ * </pre></blockquote>
+ *
+ * @param prefix A special prefix for the term.
+ * @param term The term surface (see synopsis).
*/
- public void setTerm (String term) {
- this.term = term;
+ public MultiTerm (char prefix, String term) {
+ StringBuilder sb = new StringBuilder();
+ _fromString(sb.append(prefix).append(':').append(term).toString());
};
/**
- * Returns the term value.
+ * Get the term value of the MultiTerm.
*
- * @return The term value.
+ * @return The term as a string.
*/
public String getTerm () {
- return this.term;
+ return this.term;
};
-
+
/**
- * Set the payload as a byte value.
+ * Set the term value of the MultiTerm.
*
- * @param pl The payload.
+ * @param term The term as a string.
+ * @return The {@link MultiTerm} object for chaining.
*/
- public void setPayload (Byte pl) {
- this.payload = new BytesRef( ByteBuffer.allocate(1).put(pl).array());
+ public MultiTerm setTerm (String term) {
+ this.term = term;
+ return this;
};
-
- /**
- * Set the payload as a short value.
- *
- * @param pl The payload.
- */
- public void setPayload (short pl) {
- this.payload = new BytesRef( ByteBuffer.allocate(2).putShort(pl).array());
- };
-
-
- /**
- * Set the payload as an integer value.
- *
- * @param pl The payload.
- */
- public void setPayload (int pl) {
- this.payload = new BytesRef( ByteBuffer.allocate(4).putInt(pl).array());
- };
-
-
- /**
- * Set the payload as a long value.
- *
- * @param pl The payload.
- */
- public void setPayload (long pl) {
- this.payload = new BytesRef( ByteBuffer.allocate(8).putLong(pl).array());
- };
-
-
- /**
- * Set the payload as a string value.
- *
- * @param pl The payload.
- */
- public void setPayload (String pl) {
- this.payload = new BytesRef(pl);
- };
-
-
- /**
- * Set the payload as a byte array.
- *
- * @param pl The payload.
- */
- public void setPayload (byte[] pl) {
- this.payload = new BytesRef(pl);
- };
-
-
- /**
- * Set the payload as a BytesRef.
- *
- * @param pl The payload.
- */
- public void setPayload (BytesRef pl) {
- this.payload = pl;
- };
/**
* Get the payload.
@@ -166,17 +119,91 @@
* @return The payload as a BytesRef.
*/
public BytesRef getPayload () {
- return this.payload;
+ return this.payload;
+ };
+
+
+ /**
+ * Set the payload as a {@link Byte} value.
+ *
+ * @param pl The payload.
+ * @return The {@link MultiTerm} object for chaining.
+ */
+ public MultiTerm setPayload (Byte pl) {
+ this.payload = new BytesRef( ByteBuffer.allocate(1).put(pl).array());
+ return this;
+ };
+
+
+ /**
+ * Set the payload as a short value.
+ *
+ * @param pl The payload.
+ * @return The {@link MultiTerm} object for chaining.
+ */
+ public MultiTerm setPayload (short pl) {
+ this.payload = new BytesRef( ByteBuffer.allocate(2).putShort(pl).array());
+ return this;
};
/**
- * Set the start position of the term.
+ * Set the payload as an integer value.
*
- * @param The start position.
+ * @param pl The payload.
+ * @return The {@link MultiTerm} object for chaining.
*/
- public void setStart (int value) {
- this.start = value;
+ public MultiTerm setPayload (int pl) {
+ this.payload = new BytesRef( ByteBuffer.allocate(4).putInt(pl).array());
+ return this;
+ };
+
+
+ /**
+ * Set the payload as a long value.
+ *
+ * @param pl The payload.
+ * @return The {@link MultiTerm} object for chaining.
+ */
+ public MultiTerm setPayload (long pl) {
+ this.payload = new BytesRef( ByteBuffer.allocate(8).putLong(pl).array());
+ return this;
+ };
+
+
+ /**
+ * Set the payload as a string value.
+ *
+ * @param pl The payload.
+ * @return The {@link MultiTerm} object for chaining.
+ */
+ public MultiTerm setPayload (String pl) {
+ this.payload = new BytesRef(pl);
+ return this;
+ };
+
+
+ /**
+ * Set the payload as a byte array.
+ *
+ * @param pl The payload.
+ * @return The {@link MultiTerm} object for chaining.
+ */
+ public MultiTerm setPayload (byte[] pl) {
+ this.payload = new BytesRef(pl);
+ return this;
+ };
+
+
+ /**
+ * Set the payload as a {@link BytesRef} object.
+ *
+ * @param pl The payload.
+ * @return The {@link MultiTerm} object for chaining.
+ */
+ public MultiTerm setPayload (BytesRef pl) {
+ this.payload = pl;
+ return this;
};
@@ -186,17 +213,19 @@
* @return The start position.
*/
public int getStart () {
- return this.start;
+ return this.start;
};
/**
- * Set the end position of the term.
+ * Set the start position.
*
- * @param The end position.
+ * @param start The start position.
+ * @return The {@link MultiTerm} object for chaining.
*/
- public void setEnd (int value) {
- this.end = value;
+ public MultiTerm setStart (int start) {
+ this.start = start;
+ return this;
};
@@ -206,18 +235,19 @@
* @return The end position.
*/
public int getEnd () {
- return this.end;
+ return this.end;
};
/**
- * Set the flag for stored offsets.
+ * Set the end position.
*
- * @param value Boolean value indicating that the term
- * contains stored offsets.
+ * @param end The end position.
+ * @return The {@link MultiTerm} object for chaining.
*/
- public void hasStoredOffsets (boolean value) {
- this.storeOffsets = value;
+ public MultiTerm setEnd (int end) {
+ this.end = end;
+ return this;
};
@@ -228,98 +258,25 @@
* contains stored offsets.
*/
public boolean hasStoredOffsets () {
- return this.storeOffsets;
- };
-
-
- private void _fromString (String term) {
- String[] termSurface = term.split("\\$", 2);
-
- // Payload is given
- if (termSurface.length == 2) {
- String payloadStr = termSurface[1];
-
- // Payload has a type
- if (payloadStr.charAt(0) == '<' && payloadStr.charAt(2) == '>') {
-
- // Rewind bytebuffer
- bb.rewind();
-
- // Split payload at type marker boundaries
- String[] pls = payloadStr.split("(?=<)|(?<=>)");
-
- l = 0; // Bytearray length
-
- try {
- for (i = 1; i < pls.length;) {
-
- // Resize the bytebuffer
- if ((bb.capacity() - l) < 8) {
- bb = ByteBuffer.allocate(bb.capacity() + 8)
- .put(bb.array());
- bb.position(l);
- };
-
- switch (pls[i]) {
- case "<b>": // byte
- bb.put(Byte.parseByte(pls[i+1]));
- l++;
- break;
- case "<s>": // short
- bb.putShort(Short.parseShort(pls[i+1]));
- l+=2;
- break;
- case "<i>": // integer
- bb.putInt(Integer.parseInt(pls[i+1]));
- l+=4;
- break;
- case "<l>": // long
- bb.putLong(Long.parseLong(pls[i+1]));
- l+=8;
- break;
- };
- i+=2;
- };
-
- byte[] bytes = new byte[l];
- System.arraycopy(bb.array(), 0, bytes, 0, l);
- this.payload = new BytesRef(bytes);
- }
- catch (Exception e) {
- };
- }
-
- // Payload is a string
- else {
- this.payload = new BytesRef(payloadStr);
- };
- };
-
- // Parse offset information
- stringOffset = termSurface[0].split("\\#", 2);
-
- if (stringOffset.length == 2) {
-
- // Split start and end position of the offset
- String[] offset = stringOffset[1].split("\\-", 2);
-
- // Start and end is given
- if (offset.length == 2 && offset[0].length() > 0) {
- try {
- this.start = Integer.parseInt(offset[0]);
- this.end = Integer.parseInt(offset[1]);
-
- }
- catch (NumberFormatException e) {
- };
- };
- };
- this.term = stringOffset[0];
+ return this.storeOffsets;
};
/**
- * Represent the MultiTerm as a string.
+ * Set the flag for stored offsets, in case they are relevant.
+ *
+ * @param value Boolean value indicating that the term
+ * contains stored offsets.
+ * @return The {@link MultiTerm} object for chaining.
+ */
+ public MultiTerm hasStoredOffsets (boolean value) {
+ this.storeOffsets = value;
+ return this;
+ };
+
+
+ /**
+ * Represent the MultiTerm as a string (see synopsis).
* Offsets are attached following a hash sign,
* payloads are attached following a dollar sign.
* All payloads are written as UTF-8 character sequences.
@@ -327,50 +284,144 @@
* @see #toStringShort().
*/
public String toString () {
+ StringBuilder sb = new StringBuilder(this.term);
+ if (this.start != this.end) {
+ sb.append('#')
+ .append(this.start)
+ .append('-')
+ .append(this.end);
+ };
- StringBuilder sb = new StringBuilder(this.term);
+ if (this.payload != null) {
+ sb.append('$');
+ try {
+ sb.append(this.payload.utf8ToString());
+ }
+ catch (AssertionError e) {
+ sb.append("<?>")
+ .append(this.payload.toString().replace(' ', ','));
+ };
+ };
- if (this.start != this.end) {
- sb.append('#')
- .append(this.start)
- .append('-')
- .append(this.end);
- };
-
- if (this.payload != null) {
- sb.append('$');
- try {
- sb.append(this.payload.utf8ToString());
- }
- catch (AssertionError e) {
- sb.append("<?>")
- .append(this.payload.toString().replace(' ', ','));
- };
- };
-
- return sb.toString();
+ return sb.toString();
};
+
/**
* Represent the MultiTerm as a string.
* Payloads are attached following a dollar sign.
* All payloads are written as UTF-8 character sequences.
* Offsets are neglected.
+ *
+ * Offsets are ignored.
*
* @see #toString().
*/
public String toStringShort () {
- StringBuilder sb = new StringBuilder(this.term);
- if (this.payload != null) {
- sb.append('$');
- try {
- sb.append(this.payload.utf8ToString());
- }
- catch (AssertionError e) {
- sb.append("<?>")
- .append(this.payload.toString().replace(' ', ','));
- };
- };
- return sb.toString();
+ StringBuilder sb = new StringBuilder(this.term);
+ if (this.payload != null) {
+ sb.append('$');
+ try {
+ sb.append(this.payload.utf8ToString());
+ }
+ catch (AssertionError e) {
+ sb.append("<?>")
+ .append(this.payload.toString().replace(' ', ','));
+ };
+ };
+ return sb.toString();
+ };
+
+
+ /*
+ * Deserialize MultiTerm from string representation.
+ */
+ private void _fromString (String term) {
+ String[] termSurface = term.split("\\$", 2);
+
+ // Payload is given
+ if (termSurface.length == 2) {
+ String payloadStr = termSurface[1];
+
+ // Payload has a type
+ if (payloadStr.charAt(0) == '<' && payloadStr.charAt(2) == '>') {
+
+ // Rewind bytebuffer
+ bb.rewind();
+
+ // Split payload at type marker boundaries
+ String[] pls = payloadStr.split("(?=<)|(?<=>)");
+
+ l = 0; // Bytearray length
+
+ try {
+ for (i = 1; i < pls.length;) {
+
+ // Resize the bytebuffer
+ if ((bb.capacity() - l) < 8) {
+ bb = ByteBuffer.allocate(bb.capacity() + 8).
+ put(bb.array());
+ bb.position(l);
+ };
+
+ switch (pls[i]) {
+ case "<b>": // byte
+ bb.put(Byte.parseByte(pls[i+1]));
+ l++;
+ break;
+ case "<s>": // short
+ bb.putShort(Short.parseShort(pls[i+1]));
+ l+=2;
+ break;
+ case "<i>": // integer
+ bb.putInt(Integer.parseInt(pls[i+1]));
+ l+=4;
+ break;
+ case "<l>": // long
+ bb.putLong(Long.parseLong(pls[i+1]));
+ l+=8;
+ break;
+ };
+ i+=2;
+ };
+
+ byte[] bytes = new byte[l];
+ System.arraycopy(bb.array(), 0, bytes, 0, l);
+ this.payload = new BytesRef(bytes);
+ }
+ catch (Exception e) {
+ if (DEBUG)
+ log.warn(e.getMessage());
+ };
+ }
+
+ // Payload is a string
+ else {
+ this.payload = new BytesRef(payloadStr);
+ };
+ };
+
+ // Parse offset information
+ stringOffset = termSurface[0].split("\\#", 2);
+
+ if (stringOffset.length == 2) {
+
+ // Split start and end position of the offset
+ String[] offset = stringOffset[1].split("\\-", 2);
+
+ // Start and end is given
+ if (offset.length == 2 && offset[0].length() > 0) {
+ try {
+ this.start = Integer.parseInt(offset[0]);
+ this.end = Integer.parseInt(offset[1]);
+
+ }
+ catch (NumberFormatException e) {
+ if (DEBUG)
+ log.warn("Offset not a number: {}", term);
+ };
+ };
+ };
+ this.term = stringOffset[0];
};
};
diff --git a/src/main/java/de/ids_mannheim/korap/analysis/MultiTermToken.java b/src/main/java/de/ids_mannheim/korap/analysis/MultiTermToken.java
index 9af4156..5332090 100644
--- a/src/main/java/de/ids_mannheim/korap/analysis/MultiTermToken.java
+++ b/src/main/java/de/ids_mannheim/korap/analysis/MultiTermToken.java
@@ -5,171 +5,207 @@
/**
- * @author Nils Diewald
*
- * MultiTermToken represents a segment in a MultiTermTokenStream.
+ * A MultiTermToken represents a set of {@link MultiTerm MultiTerms}
+ * starting at the same position, i.e. represents a segment
+ * in a {@link MultiTermTokenStream}.
+ *
+ * <blockquote><pre>
+ * MultiTermToken mtt = new MultiTermToken("t:test", "a:abbruch");
+ * mtt.add("b:banane");
+ * System.err.println(mtt.toString());
+ * // [t:test|a:abbruch|b:banane]
+ * </pre></blockquote>
+ *
+ * @author diewald
*/
public class MultiTermToken {
public int start, end = 0;
public List<MultiTerm> terms;
-
private static short i = 0;
+
/**
- * The constructor.
+ * Construct a new MultiTermToken by passing a stream of
+ * {@link MultiTerm MultiTerms}.
*
- * @param terms Take at least one MultiTerm object for a token.
+ * @param terms Take at least one {@link MultiTerm} object for a token.
*/
- public MultiTermToken (MultiTerm term, MultiTerm ... moreTerms) {
- this.terms = new ArrayList<MultiTerm>(16);
+ public MultiTermToken (MultiTerm terms, MultiTerm ... moreTerms) {
+ this.terms = new ArrayList<MultiTerm>(16);
+
+ // Start position is not equal to end position
+ if (terms.start != terms.end) {
+ this.start = terms.start;
+ this.end = terms.end;
+ };
- if (term.start != term.end) {
- this.start = term.start;
- this.end = term.end;
- };
+ terms.posIncr = 1;
+ this.terms.add( terms );
- term.posIncr = 1;
- terms.add( term );
-
- // Further elements on same position
- for (i = 0; i < moreTerms.length; i++) {
- term = moreTerms[i];
- term.posIncr = 0;
- terms.add(term);
- };
+ // Further elements on same position
+ for (i = 0; i < moreTerms.length; i++) {
+ moreTerms[i].posIncr = 0;
+ this.terms.add(moreTerms[i]);
+ };
};
/**
- * The constructor.
+ * Construct a new MultiTermToken by passing a {@link MultiTerm}
+ * represented as a prefixed string.
*
- * @param prefix A term prefix.
- * @param surface A surface string.
+ * @param prefix The term prefix.
+ * @param surface The term surface.
+ * @see MultiTerm
*/
public MultiTermToken (char prefix, String surface) {
- this.terms = new ArrayList<MultiTerm>(16);
+ this.terms = new ArrayList<MultiTerm>(16);
- MultiTerm term = new MultiTerm(prefix, surface);
+ // Create a new MultiTerm
+ MultiTerm term = new MultiTerm(prefix, surface);
- this.setOffset(term.start, term.end);
-
- // First word element
- term.posIncr = 1;
- terms.add( term );
+ this.setOffset(term.start, term.end);
+
+ // First word element
+ term.posIncr = 1;
+ terms.add( term );
};
/**
- * The constructor.
+ * Construct a new MultiTermToken by passing a stream of
+ * {@link MultiTerm MultiTerms} represented as strings.
*
- * @param prefix At least one term surface string.
+ * @param terms Take at least one {@link MultiTerm} string for a token.
*/
- public MultiTermToken (String surface, String ... moreTerms) {
- this.terms = new ArrayList<MultiTerm>(16);
+ public MultiTermToken (String terms, String ... moreTerms) {
+ this.terms = new ArrayList<MultiTerm>(16);
- MultiTerm term = new MultiTerm(surface);
+ MultiTerm term = new MultiTerm(terms);
+ this.setOffset(term.start, term.end);
- this.setOffset(term.start, term.end);
+ // First word element
+ term.posIncr = 1;
+ this.terms.add( term );
- // First word element
- term.posIncr = 1;
- terms.add( term );
-
- // Further elements on same position
- for (i = 0; i < moreTerms.length; i++) {
- term = new MultiTerm( moreTerms[i] );
- this.setOffset(term.start, term.end);
- term.posIncr = 0;
- terms.add(term);
- };
+ // Further elements on same position
+ for (i = 0; i < moreTerms.length; i++) {
+ term = new MultiTerm( moreTerms[i] );
+ this.setOffset(term.start, term.end);
+ term.posIncr = 0;
+ this.terms.add(term);
+ };
};
/**
- * Add a new term to the MultiTermToken.
+ * Add a new {@link MultiTerm} to the MultiTermToken.
*
- * @param mt A MultiTerm.
+ * @param term A {@link MultiTerm} object.
+ * @return The {@link MultiTermToken} object for chaining.
*/
- public void add (MultiTerm mt) {
- mt.posIncr = 0;
- this.setOffset(mt.start, mt.end);
- terms.add(mt);
+ public MultiTermToken add (MultiTerm term) {
+ term.posIncr = 0;
+ this.setOffset(term.start, term.end);
+ terms.add(term);
+ return this;
};
/**
- * Add a new term to the MultiTermToken.
+ * Add a new {@link MultiTerm} to the MultiTermToken.
*
- * @param term A surface string.
+ * @param term A MultiTerm represented as a surface string.
+ * @return The {@link MultiTermToken} object for chaining.
*/
- public void add (String term) {
- if (term.length() == 0)
- return;
- MultiTerm mt = new MultiTerm(term);
- this.setOffset(mt.start, mt.end);
- mt.posIncr = 0;
- terms.add(mt);
- };
-
- /**
- * Add a new term to the MultiTermToken.
- *
- * @param prefix A prefix character for the surface string.
- * @param term A surface string.
- */
- public void add (char prefix, String term) {
- if (term.length() == 0)
- return;
- MultiTerm mt = new MultiTerm(prefix, term);
- this.setOffset(mt.start, mt.end);
- mt.posIncr = 0;
- terms.add(mt);
+ public MultiTermToken add (String term) {
+ if (term.length() == 0)
+ return this;
+ MultiTerm mt = new MultiTerm(term);
+ this.setOffset(mt.start, mt.end);
+ mt.posIncr = 0;
+ terms.add(mt);
+ return this;
};
/**
- * Sets the offset information of the MultiTermToken.
+ * Add a new {@link MultiTerm} to the MultiTermToken.
+ *
+ * @param prefix A MultiTerm prefix.
+ * @param term A MultiTerm represented as a surface string.
+ * @return The {@link MultiTermToken} object for chaining.
+ */
+ public MultiTermToken add (char prefix, String term) {
+ if (term.length() == 0)
+ return this;
+ MultiTerm mt = new MultiTerm(prefix, term);
+ this.setOffset(mt.start, mt.end);
+ mt.posIncr = 0;
+ terms.add(mt);
+ return this;
+ };
+
+
+ /**
+ * Set the start and end character offset information
+ * of the MultiTermToken.
*
* @param start The character position of the token start.
* @param end The character position of the token end.
+ * @return The {@link MultiTermToken} object for chaining.
*/
- public void setOffset (int start, int end) {
- if (start != end) {
- this.start = (this.start == 0 || start < this.start) ? start : this.start;
- this.end = end > this.end ? end : this.end;
- };
+ public MultiTermToken setOffset (int start, int end) {
+
+ // No value to set - offsets indicating a null string
+ if (start != end) {
+ this.start =
+ (this.start == 0 || start < this.start) ?
+ start : this.start;
+
+ this.end = end > this.end ? end : this.end;
+ };
+
+ return this;
};
+
+ /**
+ * Get the number of {@link MultiTerm MultiTerms}
+ * in the MultiTermToken.
+ *
+ * @return The number of {@link MultiTerm MultiTerms}
+ * in the MultiTermToken.
+ */
+ public int getSize () {
+ return this.terms.size();
+ };
+
+
+
/**
* Serialize the MultiTermToken to a string.
*
- * @return A string representation of the token, with leading offset information.
+ * @return A string representation of the MultiTermToken,
+ * with leading offset information.
*/
public String toString () {
- StringBuffer sb = new StringBuffer();
+ StringBuffer sb = new StringBuffer();
+ sb.append('[');
+ if (this.start != this.end) {
+ sb.append('(')
+ .append(this.start)
+ .append('-')
+ .append(this.end)
+ .append(')');
+ };
- sb.append('[');
- if (this.start != this.end) {
- sb.append('(')
- .append(this.start)
- .append('-')
- .append(this.end)
- .append(')');
- };
-
- i = 0;
- for (; i < this.terms.size() - 1; i++) {
- sb.append(this.terms.get(i).toString()).append('|');
- };
- sb.append(this.terms.get(i).toString()).append(']');
-
- return sb.toString();
- };
-
- /**
- * Return the number of MultiTerms in the MultiTermToken.
- */
- public int size () {
- return this.terms.size();
+ for (i = 0; i < this.terms.size() - 1; i++) {
+ sb.append(this.terms.get(i).toString()).append('|');
+ };
+ sb.append(this.terms.get(i).toString()).append(']');
+
+ return sb.toString();
};
};
diff --git a/src/main/java/de/ids_mannheim/korap/analysis/MultiTermTokenStream.java b/src/main/java/de/ids_mannheim/korap/analysis/MultiTermTokenStream.java
index c869c0a..8a7f3a7 100644
--- a/src/main/java/de/ids_mannheim/korap/analysis/MultiTermTokenStream.java
+++ b/src/main/java/de/ids_mannheim/korap/analysis/MultiTermTokenStream.java
@@ -9,9 +9,7 @@
import java.util.regex.*;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -20,298 +18,368 @@
import java.io.IOException;
/*
- Todo:
- - !Payload is [4ByteStartOffset][14BitEndOffset-startOffset][1BitBooleanIfSpan][1BitBooleanIfOpen]
- - Payload is [4ByteOffsetStart][4ByteOffsetStart]
-*/
+ * Todo:
+ * - !Payload is
+ * [4ByteStartOffset][14BitEndOffset-startOffset]
+ * [1BitBooleanIfSpan][1BitBooleanIfOpen]
+ * - Payload is
+ * [4ByteOffsetStart][4ByteOffsetStart]
+ */
/**
- * @author Nils Diewald
- * @version 0.3
+ * MultiTermTokenStream extends Lucene's {@link TokenStream}
+ * to work with {@link MultiTermToken MultiTermTokens}.
*
- * MultiTermTokenStream extends Lucenes TokenStream class to work with MultiTermTokens.
+ * <blockquote><pre>
+ * MultiTermTokenStream mtts = new MultiTermTokenStream(
+ * "[s:den#0-3|i:den|p:DET|l:der|m:c:acc|m:n:sg|m:masc]"
+ * );
+ * </pre></blockquote>
*
- * @see org.apache.lucene.analysis.TokenStream
+ * @author diewald
+ * @see TokenStream
*/
public class MultiTermTokenStream extends TokenStream {
private CharTermAttribute charTermAttr;
private PositionIncrementAttribute posIncrAttr;
private PayloadAttribute payloadAttr;
-
- /*
- TODO: Update to new Tokeanstream API
- http://www.hankcs.com/program/java/lucene-4-6-1-java-lang-illegalstateexception-tokenstream-contract-violation.html
- */
-
- private static final Pattern pattern = Pattern.compile("\\[(?:\\([0-9]+-[0-9]+\\))?([^\\]]+?)\\]");
+ private static final Pattern pattern =
+ Pattern.compile("\\[(?:\\([0-9]+-[0-9]+\\))?([^\\]]+?)\\]");
// This advices the java compiler to ignore all loggings
public static final boolean DEBUG = false;
private final Logger log = LoggerFactory.getLogger(MultiTermTokenStream.class);
private List<MultiTermToken> multiTermTokens;
- private int mttIndex = 0, mtIndex = 0;
+ private int mttIndex = 0,
+ mtIndex = 0;
private static short i = 0;
-
/**
- * The empty Constructor.
+ * Construct a new MultiTermTokenStream object.
*/
public MultiTermTokenStream () {
this.charTermAttr = this.addAttribute(CharTermAttribute.class);
this.posIncrAttr = this.addAttribute(PositionIncrementAttribute.class);
- this.payloadAttr = this.addAttribute(PayloadAttribute.class);
- this.multiTermTokens = new ArrayList<MultiTermToken>(100);
+ this.payloadAttr = this.addAttribute(PayloadAttribute.class);
+ this.multiTermTokens = new ArrayList<MultiTermToken>(100);
};
/**
- * The Constructor.
+ * Construct a new MultiTermTokenStream object
*
- * @param stream The MultiTermTokenStream as a string representation.
+ * @param stream The stream as a string representation.
*/
public MultiTermTokenStream (String stream) {
- this();
- this._fromString(stream);
- };
+ this();
+ this._fromString(stream);
+ };
+
/**
- * The Constructor.
+ * Construct a new MultiTermTokenStream object
*
- * @param stream The MultiTermTokenStream as a reader object.
+ * @param stream The stream as a {@link Reader} object.
+ * @throws IOException
*/
public MultiTermTokenStream (Reader stream) throws IOException {
- this();
+ this();
- StringBuilder sb = new StringBuilder(4096);
- char[] buf = new char[128];
- int i;
- while ((i = stream.read(buf)) > 0) {
- sb.append(buf, 0, i);
- };
- this._fromString(sb.toString());
+ StringBuilder sb = new StringBuilder(4096);
+ char[] buf = new char[128];
+
+ int j;
+ while ((j = stream.read(buf)) > 0)
+ sb.append(buf, 0, j);
+
+ this._fromString(sb.toString());
};
/**
- * Add a MultiTermToken to the end of the MultiTermTokenStream.
+ * Append a {@link MultiTermToken} to the MultiTermTokenStream.
*
- * @param mtt A MultiTermToken.
+ * @param mtt A {@link MultiTermToken}.
+ * @return The {@link MultiTermTokenStream} object for chaining.
*/
- public void addMultiTermToken (MultiTermToken mtt) {
- this.multiTermTokens.add(mtt);
+ public MultiTermTokenStream addMultiTermToken (MultiTermToken mtt) {
+ this.multiTermTokens.add(mtt);
+ return this;
};
/**
- * Add a MultiTermToken by means of MultiTerms to the end of
- * the MultiTermTokenStream.
+ * Append a {@link MultiTermToken} to the MultiTermTokenStream
+ * by means of a set of {@link MultiTerm MultiTerms}.
*
- * @param term At least one MultiTerm.
+ * @param mts The first {@link MultiTerm}; further ones may follow as additional arguments.
+ * @return The {@link MultiTermTokenStream} object for chaining.
*/
- public void addMultiTermToken (MultiTerm term, MultiTerm ... moreTerms) {
- this.addMultiTermToken(new MultiTermToken(term, moreTerms));
+ public MultiTermTokenStream addMultiTermToken
+ (MultiTerm mts, MultiTerm ... moreTerms) {
+ this.addMultiTermToken(new MultiTermToken(mts, moreTerms));
+ return this;
};
/**
- * Add a MultiTermToken by means of a single MultiTerm to the end of
- * the MultiTermTokenStream.
+ * Append a {@link MultiTermToken} to the MultiTermTokenStream
+ * by means of a single {@link MultiTerm} as a prefixed term.
*
- * @param prefix A prefix character of a surface form of a MultiTerm.
- * @param surface A surface string of a MultiTerm.
+ * @param prefix A prefix character of a surface form of a {@link MultiTerm}.
+ * @param surface A surface string of a {@link MultiTerm}.
+ * @return The {@link MultiTermTokenStream} object for chaining.
*/
- public void addMultiTermToken (char prefix, String surface) {
- this.addMultiTermToken(new MultiTermToken(prefix, surface));
+ public MultiTermTokenStream addMultiTermToken
+ (char prefix, String surface) {
+ this.addMultiTermToken(new MultiTermToken(prefix, surface));
+ return this;
};
/**
- * Add a MultiTermToken by means of a a series of surface strings
- * to the end of the MultiTermTokenStream.
+ * Append a {@link MultiTermToken} to the MultiTermTokenStream
+ * by means of a set of {@link MultiTerm MultiTerms}
+ * given as strings.
*
- * @param surface At least one surface string of a MultiTerm.
+ * @param surface At least one surface string of a {@link MultiTerm}.
+ * @return The {@link MultiTermTokenStream} object for chaining.
*/
- public void addMultiTermToken (String surface, String ... moreTerms) {
- this.addMultiTermToken(new MultiTermToken(surface, moreTerms));
+ public MultiTermTokenStream addMultiTermToken
+ (String surface, String ... moreTerms) {
+ this.addMultiTermToken(new MultiTermToken(surface, moreTerms));
+ return this;
};
/**
* Add meta information to the MultiTermTokenStream.
*
+ * <strong>This is experimental!</strong>
+ *
* @param key A string for denoting the meta information.
* @param value The value of the meta key as a string.
+ * @return The {@link MultiTermTokenStream} object for chaining.
*/
- public void addMeta (String key, String value) {
- MultiTerm mt = new MultiTerm('-', key);
- mt.setPayload(value);
- this.multiTermTokens.get(0).add(mt);
+ public MultiTermTokenStream addMeta (String key, String value) {
+ MultiTerm mt = new MultiTerm('-', key);
+ mt.setPayload(value);
+ this.multiTermTokens.get(0).add(mt);
+ return this;
};
/**
* Add meta information to the MultiTermTokenStream.
*
+ * <strong>This is experimental!</strong>
+ *
* @param key A string for denoting the meta information.
* @param value The value of the meta key as a byte array.
+ * @return The {@link MultiTermTokenStream} object for chaining.
*/
- public void addMeta (String key, byte[] value) {
- MultiTerm mt = new MultiTerm('-', key);
- mt.setPayload(value);
- this.multiTermTokens.get(0).add(mt);
+ public MultiTermTokenStream addMeta (String key, byte[] value) {
+ MultiTerm mt = new MultiTerm('-', key);
+ mt.setPayload(value);
+ this.multiTermTokens.get(0).add(mt);
+ return this;
};
/**
* Add meta information to the MultiTermTokenStream.
*
+ * <strong>This is experimental!</strong>
+ *
* @param key A string for denoting the meta information.
* @param value The value of the meta key as a short value.
+ * @return The {@link MultiTermTokenStream} object for chaining.
*/
- public void addMeta (String key, short value) {
- MultiTerm mt = new MultiTerm('-', key);
- mt.setPayload(value);
- this.multiTermTokens.get(0).add(mt);
+ public MultiTermTokenStream addMeta (String key, short value) {
+ MultiTerm mt = new MultiTerm('-', key);
+ mt.setPayload(value);
+ this.multiTermTokens.get(0).add(mt);
+ return this;
};
/**
* Add meta information to the MultiTermTokenStream.
*
+ * <strong>This is experimental!</strong>
+ *
* @param key A string for denoting the meta information.
* @param value The value of the meta key as a long value.
+ * @return The {@link MultiTermTokenStream} object for chaining.
*/
- public void addMeta (String key, long value) {
- MultiTerm mt = new MultiTerm('-', key);
- mt.setPayload(value);
- this.multiTermTokens.get(0).add(mt);
+ public MultiTermTokenStream addMeta (String key, long value) {
+ MultiTerm mt = new MultiTerm('-', key);
+ mt.setPayload(value);
+ this.multiTermTokens.get(0).add(mt);
+ return this;
};
/**
* Add meta information to the MultiTermTokenStream.
*
+ * <strong>This is experimental!</strong>
+ *
* @param key A string for denoting the meta information.
* @param value The value of the meta key as a integer value.
+ * @return The {@link MultiTermTokenStream} object for chaining.
*/
- public void addMeta (String key, int value) {
- MultiTerm mt = new MultiTerm('-', key);
- mt.setPayload(value);
- this.multiTermTokens.get(0).add(mt);
+ public MultiTermTokenStream addMeta (String key, int value) {
+ MultiTerm mt = new MultiTerm('-', key);
+ mt.setPayload(value);
+ this.multiTermTokens.get(0).add(mt);
+ return this;
};
/**
+ * Get a {@link MultiTermToken} by index.
+ *
+ * @param index The index position of a {@link MultiTermToken}
+ * in the {@link MultiTermTokenStream}.
+ * @return A {@link MultiTermToken}.
+ */
+ public MultiTermToken get (int index) {
+ return this.multiTermTokens.get(index);
+ };
+
+
+ /**
+ * Get the number of {@link MultiTermToken MultiTermTokens}
+ * in the stream.
+ *
+ * @return The number of {@link MultiTermToken MultiTermTokens}
+ * in the stream.
+ */
+ public int getSize () {
+ return this.multiTermTokens.size();
+ };
+
+
+ /**
+ * Serialize the MultiTermTokenStream to a string.
+ *
+ * @return The MultiTermTokenStream as a string.
+ */
+ public String toString () {
+ StringBuffer sb = new StringBuffer();
+ for (MultiTermToken mtt : this.multiTermTokens) {
+ sb.append( mtt.toString() );
+ };
+ return sb.toString();
+ };
+
+
+ // Deserialize a string
+ private void _fromString (String stream) {
+ Matcher matcher = pattern.matcher(stream);
+
+ while (matcher.find()) {
+ String[] seg = matcher.group(1).split("\\|");
+ MultiTermToken mtt = new MultiTermToken( seg[0] );
+
+ for (i = 1; i < seg.length; i++)
+ mtt.add(seg[i]);
+
+ this.addMultiTermToken(mtt);
+ };
+ };
+
+
+ /*
* Increment the token in the MultiTermTokenStream.
* This overrides the function in Lucene's TokenStream.
*/
@Override
public final boolean incrementToken() throws IOException {
- this.payloadAttr.setPayload(null);
+ this.payloadAttr.setPayload(null);
- // Last token reached
- if (this.multiTermTokens.size() == this.mttIndex) {
- reset();
- return false;
- };
+ // Last token reached
+ if (this.multiTermTokens.size() == this.mttIndex) {
+ reset();
+ return false;
+ };
- // Get current token
- MultiTermToken mtt = this.multiTermTokens.get( this.mttIndex );
+ // Get current token
+ MultiTermToken mtt = this.multiTermTokens.get( this.mttIndex );
- // Last term reached
- if (mtt.terms.size() == this.mtIndex) {
- this.mtIndex = 0;
- this.mttIndex++;
+ // Last term reached
+ if (mtt.terms.size() == this.mtIndex) {
+ this.mtIndex = 0;
+ this.mttIndex++;
- // Last term of last token reached
- if (this.multiTermTokens.size() == this.mttIndex) {
- reset();
- return false;
- }
+ // Last term of last token reached
+ if (this.multiTermTokens.size() == this.mttIndex) {
+ reset();
+ return false;
+ }
- // Get last token
- else {
- mtt = this.multiTermTokens.get( this.mttIndex );
- };
- };
+ // Get next token
+ else {
+ mtt = this.multiTermTokens.get( this.mttIndex );
+ };
+ };
- // Get current term
- MultiTerm mt = mtt.terms.get(this.mtIndex);
+ // Get current term
+ MultiTerm mt = mtt.terms.get(this.mtIndex);
- // Set the relative position to the former term
+ // Set the relative position to the former term
posIncrAttr.setPositionIncrement( mt.posIncr );
charTermAttr.setEmpty();
- charTermAttr.append( mt.term );
+ charTermAttr.append( mt.term );
- BytesRef payload = new BytesRef();
+ BytesRef payload = new BytesRef();
- // There is offset information
- if (mt.start != mt.end) {
- if (DEBUG)
- log.trace("MultiTerm with payload offset: {}-{}", mt.start, mt.end);
+ // There is offset information
+ if (mt.start != mt.end) {
+ if (DEBUG)
+ log.trace("MultiTerm with payload offset: {}-{}", mt.start, mt.end);
- // Add offsets to BytesRef payload
- payload.append(new BytesRef(int2byte(mt.start)));
- payload.append(new BytesRef(int2byte(mt.end)));
- };
+ // Add offsets to BytesRef payload
+ payload.append(new BytesRef(int2byte(mt.start)));
+ payload.append(new BytesRef(int2byte(mt.end)));
+ };
- // There is payload in the MultiTerm
- if (mt.payload != null) {
- payload.append(mt.payload);
- if (DEBUG)
- log.trace("Create payload[1] {}", payload.toString());
- };
+ // There is payload in the MultiTerm
+ if (mt.payload != null) {
+ payload.append(mt.payload);
+ if (DEBUG)
+ log.trace("Create payload[1] {}", payload.toString());
+ };
+
+ // There is payload in the current token to index
+ if (payload.length > 0) {
+ payloadAttr.setPayload(payload);
+ if (DEBUG)
+ log.trace("Set payload[2] {}", payload.toString());
+ };
- // There is payload in the current token to index
- if (payload.length > 0) {
- payloadAttr.setPayload(payload);
- if (DEBUG)
- log.trace("Set payload[2] {}", payload.toString());
- };
+ // Some debug loggings
+ if (DEBUG) {
+ StringBuilder sb = new StringBuilder("Index: [");
+ sb.append(mt.term);
+ if (payload.length > 0)
+ sb.append('$').append(payload.toString());
+ sb.append(']');
+ sb.append(" with increment ").append(mt.posIncr);
+
+ log.trace(sb.toString());
+ };
- if (DEBUG) {
- StringBuilder sb = new StringBuilder("Index: [");
- sb.append(mt.term);
- if (payload.length > 0)
- sb.append('$').append(payload.toString());
- sb.append(']');
- sb.append(" with increment ").append(mt.posIncr);
-
- log.trace(sb.toString());
- };
-
- this.mtIndex++;
-
+ this.mtIndex++;
return true;
};
- public String toString () {
- StringBuffer sb = new StringBuffer();
- for (MultiTermToken mtt : this.multiTermTokens) {
- sb.append( mtt.toString() );
- };
- return sb.toString();
- };
-
- private void _fromString (String stream) {
- Matcher matcher = pattern.matcher(stream);
-
- while (matcher.find()) {
- String[] seg = matcher.group(1).split("\\|");
- MultiTermToken mtt = new MultiTermToken( seg[0] );
-
- for (i = 1; i < seg.length; i++)
- mtt.add(seg[i]);
-
- this.addMultiTermToken(mtt);
- };
- };
-
@Override
public void reset() {
- this.mttIndex = 0;
- this.mtIndex = 0;
+ this.mttIndex = 0;
+ this.mtIndex = 0;
};
};
diff --git a/src/main/java/de/ids_mannheim/korap/query/SpanElementQuery.java b/src/main/java/de/ids_mannheim/korap/query/SpanElementQuery.java
index 78309e2..1e3f5f6 100644
--- a/src/main/java/de/ids_mannheim/korap/query/SpanElementQuery.java
+++ b/src/main/java/de/ids_mannheim/korap/query/SpanElementQuery.java
@@ -17,21 +17,21 @@
/**
* SpanElementQuery retrieves {@link ElementSpans} which are special
- * {@link Term Terms} with prefix "<>". Unlike {@link TermSpans} consisting of
- * only one term, ElementSpans may contains more than one term comprising a
+ * {@link Term Terms} with prefix "<>".
+ * Unlike {@link TermSpans}, ElementSpans may span multiple tokens comprising a
* phrase, a clause, a sentence and so on. <br/>
* <br/>
* Examples of {@link ElementSpans} are
*
* <ul>
- * <li>sentences indexed as <>:s
+ * <li>sentences indexed as <>:s
*
* <pre>
* SpanElementQuery seq = new SpanElementQuery("tokens", "s");
* </pre>
*
* </li>
- * <li>paragraphs indexed as <>:p
+ * <li>paragraphs indexed as <>:p
*
* <pre>
* SpanElementQuery seq = new SpanElementQuery("tokens", "p");
@@ -45,7 +45,6 @@
* @author margaretha
*/
public class SpanElementQuery extends SpanWithIdQuery {
- // private SpanTermQuery termQuery;
private static Term elementTerm;
private String elementStr;
@@ -57,18 +56,18 @@
*/
public SpanElementQuery(String field, String term) {
super(new SpanTermQuery((elementTerm = new Term(field, "<>:" + term))),
- true);
+ true);
this.elementStr = term;
- // this.termQuery = (SpanTermQuery) this.getFirstClause();
- // this.elementTerm = termQuery.getTerm();
};
+
@Override
public Spans getSpans(final AtomicReaderContext context, Bits acceptDocs,
Map<Term, TermContext> termContexts) throws IOException {
return new ElementSpans(this, context, acceptDocs, termContexts);
};
+
/**
* Returns the element name or string, for instance "s" for sentence
* elements.
@@ -79,6 +78,7 @@
return elementStr;
};
+
/**
* Sets the element name or string, for instance "s" for sentence elements.
*
@@ -88,19 +88,24 @@
this.elementStr = elementStr;
}
+
@Override
public SimpleSpanQuery clone() {
- SpanElementQuery sq = new SpanElementQuery(this.getField(),
- this.getElementStr());
+ SpanElementQuery sq = new SpanElementQuery(
+ this.getField(),
+ this.getElementStr()
+ );
sq.setBoost(this.getBoost());
return sq;
};
+
@Override
public void extractTerms(Set<Term> terms) {
terms.add(elementTerm);
};
+
@Override
public String toString(String field) {
StringBuilder buffer = new StringBuilder("<");
@@ -109,6 +114,7 @@
return buffer.append(" />").toString();
};
+
@Override
public int hashCode() {
final int prime = 37; // Instead of 31
@@ -118,6 +124,7 @@
return result;
};
+
@Override
public boolean equals(Object obj) {
if (this == obj)
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/WithinSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/WithinSpans.java
index c0d32cc..532d2d4 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/WithinSpans.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/WithinSpans.java
@@ -21,14 +21,11 @@
/**
* SpanWithinQuery is DEPRECATED and will
* be replaced by SpanPositionQuery in the near future
+ *
+ * TODO: Support exclusivity
+ * TODO: Use the term "queue" and implement it similar to SpanOrQuery
*/
-/*
- TODO: Use the flag in KorapQuery!
- TODO: Support exclusivity
- TODO: Use the term queue and implement it similar to SpanOrQuery
-*/
-
/**
* Compare two spans and check how they relate positionally.
*
@@ -68,21 +65,19 @@
/*
Supported flags are currently:
- ov -> 0 | overlap: A & B != empty
- rov -> 2 | real overlap: A & B != empty and
- ((A | B) != A or
+ ov -> 0 | overlap: A & B != empty
+ rov -> 2 | real overlap: A & B != empty and
+ ((A | B) != A or
(A | B) != B)
- in -> 4 | within: A | B = A
- rin -> 6 | real within: A | B = A and
- A & B != A
- ew -> 8 | endswith: A | B = A and
- A.start = B.start
- sw -> 10 | startswith: A | B = A and
- A.end = B.end
- m -> 12 | A = B
-
- This may change in case the system switches to 16bit vector
- */
+ in -> 4 | within: A | B = A
+ rin -> 6 | real within: A | B = A and
+ A & B != A
+ ew -> 8 | endswith: A | B = A and
+ A.start = B.start
+ sw -> 10 | startswith: A | B = A and
+ A.end = B.end
+ m -> 12 | A = B
+ */
public static final byte
OVERLAP = (byte) 0,
REAL_OVERLAP = (byte) 2,
@@ -97,9 +92,14 @@
// Contains the query
private SpanWithinQuery query;
- // Contains the spans
- private final Spans embeddedSpans, wrapSpans;
+ // Representing the first operand
+ private final Spans wrapSpans;
+ // Representing the second operand
+ private final Spans embeddedSpans;
+
+ // Check flag if the current constellation
+ // was checked yet
private boolean tryMatch = true;
private LinkedList<KorapLongSpan>
diff --git a/src/test/java/de/ids_mannheim/korap/analysis/TestMultiTermToken.java b/src/test/java/de/ids_mannheim/korap/analysis/TestMultiTermToken.java
index f88b194..138c031 100644
--- a/src/test/java/de/ids_mannheim/korap/analysis/TestMultiTermToken.java
+++ b/src/test/java/de/ids_mannheim/korap/analysis/TestMultiTermToken.java
@@ -11,7 +11,9 @@
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
-
+/**
+ * @author diewald
+ */
@RunWith(JUnit4.class)
public class TestMultiTermToken {
@@ -22,14 +24,17 @@
mtt.add("b:banane");
assertEquals("[t:test|a:abbruch|b:banane]", mtt.toString());
mtt.add("c:chaos#21-26");
- assertEquals("[(21-26)t:test|a:abbruch|b:banane|c:chaos#21-26]",
- mtt.toString());
+ assertEquals(
+ "[(21-26)t:test|a:abbruch|b:banane|c:chaos#21-26]",
+ mtt.toString()
+ );
mtt.add("d:dadaismus#21-28$vergleich");
assertEquals(
"[(21-28)t:test|a:abbruch|b:banane|c:chaos#21-26|" +
"d:dadaismus#21-28$vergleich]",
mtt.toString()
);
+ assertEquals(5, mtt.getSize());
};
@Test
@@ -43,5 +48,6 @@
"c:cannonball#34-45$tatsache]", mtt.toString());
assertEquals(23, mtt.start);
assertEquals(45, mtt.end);
+ assertEquals(3, mtt.getSize());
};
};
diff --git a/src/test/java/de/ids_mannheim/korap/analysis/TestMultiTermTokenStream.java b/src/test/java/de/ids_mannheim/korap/analysis/TestMultiTermTokenStream.java
index ff20192..04f08b6 100644
--- a/src/test/java/de/ids_mannheim/korap/analysis/TestMultiTermTokenStream.java
+++ b/src/test/java/de/ids_mannheim/korap/analysis/TestMultiTermTokenStream.java
Binary files differ