Presort Multiterms in MultiTermTokenStream
diff --git a/Changes b/Changes
index c6d33fa..0b28d58 100644
--- a/Changes
+++ b/Changes
@@ -3,6 +3,9 @@
- [performance] Updated Lucene dependency from 4.5.1 to 4.10.3,
Updated Jackson dependency from 2.4.0 to 2.4.4,
Updated Jersey dependency from 2.4.1 to 2.15 (diewald)
+ - [feature] Presorting of element terms in the index for coherent
+ SpanQuery sorting (diewald)
+ Warning: This is a breaking change!
0.49.3 2015-02-03
- [documentation] Improved documentation for API classes (diewald)
diff --git a/src/main/java/de/ids_mannheim/korap/analysis/MultiTerm.java b/src/main/java/de/ids_mannheim/korap/analysis/MultiTerm.java
index 1f72e8d..754268d 100644
--- a/src/main/java/de/ids_mannheim/korap/analysis/MultiTerm.java
+++ b/src/main/java/de/ids_mannheim/korap/analysis/MultiTerm.java
@@ -35,11 +35,10 @@
*
* @author diewald
*/
-public class MultiTerm {
+public class MultiTerm implements Comparable<MultiTerm> {
public int start, end = 0;
public String term = null;
- public Integer posIncr = 1;
- public boolean storeOffsets = false;
+ private boolean storeOffsets = false;
public BytesRef payload = null;
private static ByteBuffer bb = ByteBuffer.allocate(8);
@@ -306,6 +305,21 @@
return sb.toString();
};
+    /**
+     * Compare this MultiTerm with another MultiTerm to presort
+     * the terms of a token.
+     *
+     * Terms without a payload are ordered before terms with a
+     * payload and are treated as equal among each other.
+     * Terms with a payload are ordered by their end value and,
+     * if the end values are equal, by their start value.
+     *
+     * @param o The {@link MultiTerm} to compare with.
+     * @return A negative integer, zero, or a positive integer, if
+     *         this term is to be ordered before, equal to, or after
+     *         the given term.
+     */
+    @Override
+    public int compareTo (MultiTerm o) {
+        boolean thisHasPayload = this.payload != null;
+        boolean otherHasPayload = o.payload != null;
+
+        // Treating a payload-less term as equal to every other term
+        // would make the ordering non-transitive (it would be equal to
+        // two payload terms that are themselves strictly ordered),
+        // which violates the Comparable contract. Group payload-less
+        // terms in front of payload terms instead.
+        if (!thisHasPayload || !otherHasPayload) {
+            if (thisHasPayload == otherHasPayload)
+                return 0;
+            return thisHasPayload ? 1 : -1;
+        };
+
+        // Both terms carry a payload: order by end, then by start
+        if (this.end != o.end)
+            return Integer.compare(this.end, o.end);
+
+        return Integer.compare(this.start, o.start);
+    };
+
/**
* Represent the MultiTerm as a string.
diff --git a/src/main/java/de/ids_mannheim/korap/analysis/MultiTermToken.java b/src/main/java/de/ids_mannheim/korap/analysis/MultiTermToken.java
index 5332090..b3f0375 100644
--- a/src/main/java/de/ids_mannheim/korap/analysis/MultiTermToken.java
+++ b/src/main/java/de/ids_mannheim/korap/analysis/MultiTermToken.java
@@ -1,6 +1,5 @@
package de.ids_mannheim.korap.analysis;
-import de.ids_mannheim.korap.analysis.MultiTerm;
import java.util.*;
@@ -20,10 +19,9 @@
* @author diewald
*/
public class MultiTermToken {
- public int start, end = 0;
public List<MultiTerm> terms;
private static short i = 0;
-
+ private boolean sorted = false;
/**
* Construct a new MultiTermToken by passing a stream of
@@ -34,18 +32,10 @@
public MultiTermToken (MultiTerm terms, MultiTerm ... moreTerms) {
this.terms = new ArrayList<MultiTerm>(16);
- // Start position is not equal to end position
- if (terms.start != terms.end) {
- this.start = terms.start;
- this.end = terms.end;
- };
-
- terms.posIncr = 1;
this.terms.add( terms );
// Further elements on same position
for (i = 0; i < moreTerms.length; i++) {
- moreTerms[i].posIncr = 0;
this.terms.add(moreTerms[i]);
};
};
@@ -65,10 +55,7 @@
// Create a new MultiTerm
MultiTerm term = new MultiTerm(prefix, surface);
- this.setOffset(term.start, term.end);
-
// First word element
- term.posIncr = 1;
terms.add( term );
};
@@ -83,17 +70,13 @@
this.terms = new ArrayList<MultiTerm>(16);
MultiTerm term = new MultiTerm(terms);
- this.setOffset(term.start, term.end);
// First word element
- term.posIncr = 1;
this.terms.add( term );
// Further elements on same position
for (i = 0; i < moreTerms.length; i++) {
term = new MultiTerm( moreTerms[i] );
- this.setOffset(term.start, term.end);
- term.posIncr = 0;
this.terms.add(term);
};
};
@@ -106,9 +89,8 @@
* @return The {@link MultiTermToken} object for chaining.
*/
public MultiTermToken add (MultiTerm term) {
- term.posIncr = 0;
- this.setOffset(term.start, term.end);
terms.add(term);
+ this.sorted = false;
return this;
};
@@ -122,11 +104,7 @@
public MultiTermToken add (String term) {
if (term.length() == 0)
return this;
- MultiTerm mt = new MultiTerm(term);
- this.setOffset(mt.start, mt.end);
- mt.posIncr = 0;
- terms.add(mt);
- return this;
+ return this.add(new MultiTerm(term));
};
@@ -140,34 +118,19 @@
public MultiTermToken add (char prefix, String term) {
if (term.length() == 0)
return this;
- MultiTerm mt = new MultiTerm(prefix, term);
- this.setOffset(mt.start, mt.end);
- mt.posIncr = 0;
- terms.add(mt);
- return this;
+ return this.add(new MultiTerm(prefix, term));
};
/**
- * Set the start and end character offset information
- * of the MultiTermToken.
+ * Get a {@link MultiTerm} by index.
*
- * @param start The character position of the token start.
- * @param end The character position of the token end.
- * @return The {@link MultiTermToken} object for chaining.
+ * @param index The index position of a {@link MultiTerm}
+ * in the {@link MultiTermToken}.
+ * @return A {@link MultiTerm}.
*/
- public MultiTermToken setOffset (int start, int end) {
-
- // No value to set - offsets indicating a null string
- if (start != end) {
- this.start =
- (this.start == 0 || start < this.start) ?
- start : this.start;
-
- this.end = end > this.end ? end : this.end;
- };
-
- return this;
+ public MultiTerm get (int index) {
+ return this.sort().terms.get(index);
};
@@ -183,6 +146,20 @@
};
+    /**
+     * Bring the {@link MultiTerm MultiTerms} of this token into
+     * their natural order, as defined by the comparison method
+     * of {@link MultiTerm}.
+     *
+     * The list is only sorted again if the token is not marked
+     * as sorted.
+     *
+     * @return The {@link MultiTermToken} object for chaining.
+     */
+    public MultiTermToken sort () {
+
+        // Nothing to do if the terms are already marked as sorted
+        if (!this.sorted) {
+            Collections.sort(this.terms);
+            this.sorted = true;
+        };
+
+        return this;
+    };
+
/**
* Serialize the MultiTermToken to a string.
@@ -191,16 +168,9 @@
* with leading offset information.
*/
public String toString () {
+ this.sort();
StringBuffer sb = new StringBuffer();
sb.append('[');
- if (this.start != this.end) {
- sb.append('(')
- .append(this.start)
- .append('-')
- .append(this.end)
- .append(')');
- };
-
for (i = 0; i < this.terms.size() - 1; i++) {
sb.append(this.terms.get(i).toString()).append('|');
};
diff --git a/src/main/java/de/ids_mannheim/korap/analysis/MultiTermTokenStream.java b/src/main/java/de/ids_mannheim/korap/analysis/MultiTermTokenStream.java
index 8a7f3a7..521f7b2 100644
--- a/src/main/java/de/ids_mannheim/korap/analysis/MultiTermTokenStream.java
+++ b/src/main/java/de/ids_mannheim/korap/analysis/MultiTermTokenStream.java
@@ -1,7 +1,5 @@
package de.ids_mannheim.korap.analysis;
-import de.ids_mannheim.korap.analysis.MultiTerm;
-import de.ids_mannheim.korap.analysis.MultiTermToken;
import static de.ids_mannheim.korap.util.KorapByte.*;
import org.apache.lucene.util.BytesRef;
@@ -119,8 +117,7 @@
*/
public MultiTermTokenStream addMultiTermToken
(MultiTerm mts, MultiTerm ... moreTerms) {
- this.addMultiTermToken(new MultiTermToken(mts, moreTerms));
- return this;
+ return this.addMultiTermToken(new MultiTermToken(mts, moreTerms));
};
@@ -134,8 +131,7 @@
*/
public MultiTermTokenStream addMultiTermToken
(char prefix, String surface) {
- this.addMultiTermToken(new MultiTermToken(prefix, surface));
- return this;
+ return this.addMultiTermToken(new MultiTermToken(prefix, surface));
};
@@ -149,8 +145,7 @@
*/
public MultiTermTokenStream addMultiTermToken
(String surface, String ... moreTerms) {
- this.addMultiTermToken(new MultiTermToken(surface, moreTerms));
- return this;
+ return this.addMultiTermToken(new MultiTermToken(surface, moreTerms));
};
@@ -310,6 +305,9 @@
// Get current token
MultiTermToken mtt = this.multiTermTokens.get( this.mttIndex );
+ // Sort the MultiTermToken
+ mtt.sort();
+
// Last term reached
if (mtt.terms.size() == this.mtIndex) {
this.mtIndex = 0;
@@ -331,7 +329,7 @@
MultiTerm mt = mtt.terms.get(this.mtIndex);
// Set the relative position to the former term
- posIncrAttr.setPositionIncrement( mt.posIncr );
+ posIncrAttr.setPositionIncrement( this.mtIndex == 0 ? 1 : 0 );
charTermAttr.setEmpty();
charTermAttr.append( mt.term );
@@ -368,7 +366,7 @@
if (payload.length > 0)
sb.append('$').append(payload.toString());
sb.append(']');
- sb.append(" with increment ").append(mt.posIncr);
+ sb.append(" with increment ").append(this.mtIndex == 0 ? 1 : 0);
log.trace(sb.toString());
};
diff --git a/src/test/java/de/ids_mannheim/korap/analysis/TestMultiTermToken.java b/src/test/java/de/ids_mannheim/korap/analysis/TestMultiTermToken.java
index 138c031..9d46025 100644
--- a/src/test/java/de/ids_mannheim/korap/analysis/TestMultiTermToken.java
+++ b/src/test/java/de/ids_mannheim/korap/analysis/TestMultiTermToken.java
@@ -25,12 +25,12 @@
assertEquals("[t:test|a:abbruch|b:banane]", mtt.toString());
mtt.add("c:chaos#21-26");
assertEquals(
- "[(21-26)t:test|a:abbruch|b:banane|c:chaos#21-26]",
+ "[t:test|a:abbruch|b:banane|c:chaos#21-26]",
mtt.toString()
);
mtt.add("d:dadaismus#21-28$vergleich");
assertEquals(
- "[(21-28)t:test|a:abbruch|b:banane|c:chaos#21-26|" +
+ "[t:test|a:abbruch|b:banane|c:chaos#21-26|" +
"d:dadaismus#21-28$vergleich]",
mtt.toString()
);
@@ -40,14 +40,12 @@
@Test
public void multiTermTokenOffsets () {
MultiTermToken mtt = new MultiTermToken("t:test#23-27");
- assertEquals("[(23-27)t:test#23-27]", mtt.toString());
+ assertEquals("[t:test#23-27]", mtt.toString());
mtt.add("b:baum#34-45");
- assertEquals("[(23-45)t:test#23-27|b:baum#34-45]", mtt.toString());
+ assertEquals("[t:test#23-27|b:baum#34-45]", mtt.toString());
mtt.add("c:cannonball#34-45$tatsache");
- assertEquals("[(23-45)t:test#23-27|b:baum#34-45|" +
+ assertEquals("[t:test#23-27|b:baum#34-45|" +
"c:cannonball#34-45$tatsache]", mtt.toString());
- assertEquals(23, mtt.start);
- assertEquals(45, mtt.end);
assertEquals(3, mtt.getSize());
};
};
diff --git a/src/test/java/de/ids_mannheim/korap/analysis/TestMultiTermTokenStream.java b/src/test/java/de/ids_mannheim/korap/analysis/TestMultiTermTokenStream.java
index 04f08b6..feade6f 100644
--- a/src/test/java/de/ids_mannheim/korap/analysis/TestMultiTermTokenStream.java
+++ b/src/test/java/de/ids_mannheim/korap/analysis/TestMultiTermTokenStream.java
Binary files differ
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestIndex.java b/src/test/java/de/ids_mannheim/korap/index/TestIndex.java
index d1dd608..42d18ed 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestIndex.java
@@ -78,17 +78,11 @@
assertEquals(test.terms.get(0).term, "hunde");
assertEquals(test.terms.get(1).term, "pos:n");
assertEquals(test.terms.get(2).term, "m:gen:pl");
- assertEquals(test.terms.get(0).posIncr, 1, 1);
- assertEquals(test.terms.get(1).posIncr, 0, 1);
- assertEquals(test.terms.get(2).posIncr, 0, 1);
test = new MultiTermToken("hunde", "pos:n", "m:gen:pl");
assertEquals(test.terms.get(0).term, "hunde");
assertEquals(test.terms.get(1).term, "pos:n");
assertEquals(test.terms.get(2).term, "m:gen:pl");
- assertEquals(test.terms.get(0).posIncr, 1, 1);
- assertEquals(test.terms.get(1).posIncr, 0, 1);
- assertEquals(test.terms.get(2).posIncr, 0, 1);
};
private List initIndexer () throws IOException {