Merge empty tokens to larger extensions
Change-Id: I4f1248fc2651f676dbc635b09e6c1bd29d0f6953
diff --git a/Changes b/Changes
index 6c9d48f..76b694d 100644
--- a/Changes
+++ b/Changes
@@ -5,6 +5,9 @@
query planner (diewald)
- [performance] Treat term queries like ".+?", ".+", ".*?", and ".*"
as any-terms (diewald)
+ - [feature] Introduced SpanExpansionQueryWrapper (diewald)
+ - [performance] Sequences of empty tokens will now be merged into
+ a single extension, if possible (diewald)
0.55.6 2016-08-10
- [bugfix] distance with key "t" uses default foundry (diewald)
diff --git a/src/main/java/de/ids_mannheim/korap/query/QueryBuilder.java b/src/main/java/de/ids_mannheim/korap/query/QueryBuilder.java
index 72cfb23..96e4eb5 100644
--- a/src/main/java/de/ids_mannheim/korap/query/QueryBuilder.java
+++ b/src/main/java/de/ids_mannheim/korap/query/QueryBuilder.java
@@ -257,7 +257,7 @@
return ssq;
};
-
+
/**
* Create an empty query segment.
*
diff --git a/src/main/java/de/ids_mannheim/korap/query/wrap/SpanAlterQueryWrapper.java b/src/main/java/de/ids_mannheim/korap/query/wrap/SpanAlterQueryWrapper.java
index a7b55cf..8fd18cd 100644
--- a/src/main/java/de/ids_mannheim/korap/query/wrap/SpanAlterQueryWrapper.java
+++ b/src/main/java/de/ids_mannheim/korap/query/wrap/SpanAlterQueryWrapper.java
@@ -83,7 +83,6 @@
if (term.maybeUnsorted())
this.maybeUnsorted = true;
-
this.isNull = false;
return this;
};
diff --git a/src/main/java/de/ids_mannheim/korap/query/wrap/SpanExpansionQueryWrapper.java b/src/main/java/de/ids_mannheim/korap/query/wrap/SpanExpansionQueryWrapper.java
new file mode 100644
index 0000000..7c4cf61
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/query/wrap/SpanExpansionQueryWrapper.java
@@ -0,0 +1,68 @@
+package de.ids_mannheim.korap.query.wrap;
+
+import org.apache.lucene.search.RegexpQuery;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.util.automaton.RegExp;
+import org.apache.lucene.index.Term;
+import de.ids_mannheim.korap.query.wrap.SpanQueryWrapper;
+import de.ids_mannheim.korap.query.SpanExpansionQuery;
+import de.ids_mannheim.korap.util.QueryException;
+
+
+import java.util.*;
+
+/*
+ * TODO: SpanExpansionQueryWrapper currently does not support negative extensions!
+ */
+/**
+ * @author diewald
+ */
+
+public class SpanExpansionQueryWrapper extends SpanQueryWrapper {
+ private SpanQueryWrapper anchor;
+
+ // < 0 to the left of anchor span
+ // >= 0 to the right of anchor span
+ private int direction;
+
+ // if > 0, collect expansion offsets
+ // using this label
+ private byte classNumber;
+
+
+ public SpanExpansionQueryWrapper (SpanQueryWrapper anchor,
+ int min,
+ int max,
+ int direction,
+ byte classNumber
+ ) {
+ this.anchor = anchor;
+ this.isNull = false;
+ this.min = min;
+ this.max = max;
+ this.direction = direction;
+ this.classNumber = classNumber;
+ this.isExtended = true;
+ if (direction >= 0)
+ this.isExtendedToTheRight = true;
+ };
+
+ @Override
+ public boolean isNull () {
+ // Needs to be overwritten, as min and max do not indicate null value
+ return this.isNull;
+ };
+
+
+ @Override
+ public SpanQuery toFragmentQuery () throws QueryException {
+ return new SpanExpansionQuery(
+ this.anchor.retrieveNode(this.retrieveNode).toFragmentQuery(),
+ this.getMin(),
+ this.getMax(),
+ this.direction,
+ this.classNumber,
+ true
+ );
+ };
+};
diff --git a/src/main/java/de/ids_mannheim/korap/query/wrap/SpanQueryWrapper.java b/src/main/java/de/ids_mannheim/korap/query/wrap/SpanQueryWrapper.java
index f31006d..f0221d5 100644
--- a/src/main/java/de/ids_mannheim/korap/query/wrap/SpanQueryWrapper.java
+++ b/src/main/java/de/ids_mannheim/korap/query/wrap/SpanQueryWrapper.java
@@ -95,6 +95,13 @@
return this.isOptional;
};
+
+ public SpanQueryWrapper isOptional (boolean opt) {
+ this.isOptional = opt;
+ return this;
+ };
+
+
/**
* Boolean value indicating that the wrapped query is
@@ -212,7 +219,6 @@
return this;
};
-
/**
* Check, if the wrapped query can be used as an
* anchor query in a sequence, i.e. a query that
@@ -413,8 +419,10 @@
*/
public String toString () {
String string = "" + (this.isNull() ? "isNull" : "notNull") + "-"
- + (this.isEmpty() ? "isEmpty" : "notEmpty") + "-"
- + (this.isOptional() ? "isOptional" : "notOptional");
+ + (this.isEmpty() ? "isEmpty" : "notEmpty") + "-"
+ + (this.isOptional() ? "isOptional" : "notOptional") + "-"
+ + (this.isExtendedToTheRight() ? "isExtendedToTheRight" : "notExtendedToTheRight");
+ ;
return string;
};
};
diff --git a/src/main/java/de/ids_mannheim/korap/query/wrap/SpanSequenceQueryWrapper.java b/src/main/java/de/ids_mannheim/korap/query/wrap/SpanSequenceQueryWrapper.java
index f100853..c5aac76 100644
--- a/src/main/java/de/ids_mannheim/korap/query/wrap/SpanSequenceQueryWrapper.java
+++ b/src/main/java/de/ids_mannheim/korap/query/wrap/SpanSequenceQueryWrapper.java
@@ -18,7 +18,6 @@
/*
TODO: Make isNegative work!
- TODO: Make isEmpty work!
TODO: Evaluate if spanNext(spanNext(a,b),spanNext(c,d)) is faster
than spanNext(spanNext(spanNext(a,b),c),d)
TODO: Improve support for SpanElementQueryWrapper in constraints!
@@ -728,7 +727,7 @@
if (query == null)
return (SpanQuery) null;
- // NextQueries:
+ // NextQueries
if (!this.hasConstraints() && this.isInOrder()) {
for (; i < this.segments.size(); i++) {
@@ -742,6 +741,7 @@
return (SpanQuery) query;
};
+ // DistanceQueries with problems
if (this.hasConstraints() && this.isProblematic) {
throw new QueryException(613,
"Distance constraints not supported with empty, optional or negative operands");
@@ -960,7 +960,7 @@
};
- // Todo: Deal with negative and optional!
+ // Todo: Deal with negative, empty and optional!
// [base=der][base!=Baum]?
private SpanQueryWrapper _merge (SpanQueryWrapper anchor,
SpanQueryWrapper problem, boolean mergeLeft) throws QueryException {
@@ -980,15 +980,46 @@
log.trace("Problem is empty with class {}",
problem.getClassNumber());
- query = new SpanExpansionQuery(anchor.retrieveNode(
- this.retrieveNode).toFragmentQuery(),
- problem.isOptional() ? 0 : problem.getMin(),
- problem.getMax(), direction,
- problem.hasClass() ? problem.getClassNumber() : (byte) 0,
- true);
- SpanQueryWrapper sqw = new SpanSimpleQueryWrapper(query)
- .isExtended(true);
+ // Merge extensions!
+ if (!problem.hasClass() &&
+ !anchor.hasClass() &&
+ anchor.isExtended()) {
+ if (DEBUG)
+ log.trace("It may be possible to extend anchor with problem");
+
+ if (
+ // Further extend to the right ...
+ (direction >= 0 && anchor.isExtendedToTheRight) ||
+
+ // or the left
+ (direction < 0 && !anchor.isExtendedToTheRight)) {
+
+ if (DEBUG)
+ log.trace("Readjust min and max");
+
+ // Readjust the anchor
+ anchor.setMin(anchor.getMin() + problem.getMin());
+ anchor.setMax(anchor.getMax() + problem.getMax());
+
+ /*
+ * This is wrong - min is only relevant for extensions
+ if (anchor.getMin() > 0)
+ anchor.isOptional = false;
+ */
+ return anchor;
+ };
+ };
+
+ // Can't merge extensions
+ SpanQueryWrapper sqw = new SpanExpansionQueryWrapper(
+ anchor,
+ problem.isOptional() ? 0 : problem.getMin(),
+ problem.getMax(),
+ direction,
+ problem.hasClass() ? problem.getClassNumber() : (byte) 0
+ ).isExtended(true);
+
// Set right extension
if (direction >= 0)
sqw.isExtendedToTheRight(true);
@@ -1005,15 +1036,17 @@
log.trace("Problem is negative with class {}",
problem.getClassNumber());
- query = new SpanExpansionQuery(anchor.retrieveNode(
- this.retrieveNode).toFragmentQuery(), problem.retrieveNode(
- this.retrieveNode).toFragmentQuery(), problem.getMin(),
- problem.getMax(), direction,
- problem.hasClass() ? problem.getClassNumber() : (byte) 0,
- true);
+ // TODO: Should probably wrapped as well!
+ // A sequence of negative tokens may expand jointly!
+ query = new SpanExpansionQuery(anchor.retrieveNode(
+ this.retrieveNode).toFragmentQuery(), problem.retrieveNode(
+ this.retrieveNode).toFragmentQuery(), problem.getMin(),
+ problem.getMax(), direction,
+ problem.hasClass() ? problem.getClassNumber() : (byte) 0,
+ true);
SpanQueryWrapper sqw = new SpanSimpleQueryWrapper(query)
- .isExtended(true);
+ .isExtended(true);
// Set right extension
if (direction >= 0)
@@ -1025,24 +1058,29 @@
if (DEBUG)
log.trace("Problem is optional");
+ // [base=der][][base=Baum]?
// [base=der][base=baum]?
// [base=der]
- SpanAlterQueryWrapper saqw = new SpanAlterQueryWrapper(this.field,
- anchor);
+ SpanAlterQueryWrapper saqw = new SpanAlterQueryWrapper(
+ this.field,
+ anchor
+ );
// [base=der]
SpanSequenceQueryWrapper ssqw = new SpanSequenceQueryWrapper(
- this.field, anchor);
+ this.field,
+ anchor
+ );
// [base=der][base=baum]
- if (mergeLeft)
- ssqw.append(new SpanSimpleQueryWrapper(problem.retrieveNode(
- this.retrieveNode).toFragmentQuery()));
+ if (mergeLeft) {
+ ssqw.append(problem.isOptional(false));
+ }
// [base=baum][base=der]
- else
- ssqw.prepend(new SpanSimpleQueryWrapper(problem.retrieveNode(
- this.retrieveNode).toFragmentQuery()));
+ else {
+ ssqw.prepend(problem.isOptional(false));
+ }
saqw.or(ssqw);
diff --git a/src/main/java/de/ids_mannheim/korap/query/wrap/SpanSimpleQueryWrapper.java b/src/main/java/de/ids_mannheim/korap/query/wrap/SpanSimpleQueryWrapper.java
index 61608e5..3d9c76a 100644
--- a/src/main/java/de/ids_mannheim/korap/query/wrap/SpanSimpleQueryWrapper.java
+++ b/src/main/java/de/ids_mannheim/korap/query/wrap/SpanSimpleQueryWrapper.java
@@ -3,6 +3,7 @@
import org.apache.lucene.index.Term;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
+import de.ids_mannheim.korap.util.QueryException;
public class SpanSimpleQueryWrapper extends SpanQueryWrapper {
private SpanQuery query;
@@ -25,6 +26,20 @@
this.query = query;
};
+ // This is similar to a clone
+ public SpanSimpleQueryWrapper (SpanQueryWrapper query) throws QueryException {
+ this.hasClass = query.hasClass();
+ this.isOptional = query.isOptional();
+ this.isNegative = query.isNegative();
+ this.isEmpty = query.isEmpty();
+ this.isExtended = query.isExtended();
+ this.isExtendedToTheRight = query.isExtendedToTheRight();
+ this.maybeUnsorted = query.maybeUnsorted();
+ this.retrieveNode = query.retrieveNode;
+ this.query = query.toFragmentQuery();
+ this.isNull = query.isNull();
+ };
+
@Override
public SpanQuery toFragmentQuery () {
diff --git a/src/test/java/de/ids_mannheim/korap/query/TestKrillQueryJSON.java b/src/test/java/de/ids_mannheim/korap/query/TestKrillQueryJSON.java
index 09d690b..27f00f3 100644
--- a/src/test/java/de/ids_mannheim/korap/query/TestKrillQueryJSON.java
+++ b/src/test/java/de/ids_mannheim/korap/query/TestKrillQueryJSON.java
@@ -601,16 +601,16 @@
"focus(254: spanContain(<tokens:base/s:t />, {254: spanExpansion(tokens:s:der, []{1, 1}, right)}))");
};
- @Ignore
+ @Test
public void queryJSONregexRewrite2 () throws QueryException {
- // "der" [.+?]
+ // "der" [.*] [.*?] [.+] [.+?]
String json = getString(getClass().getResource(
"/queries/sequence/regex-rewrite-2.jsonld").getFile());
KrillQuery kq = new KrillQuery("tokens");
assertEquals(
kq.fromKoral(json).toQuery().toString(),
- "focus(254: spanContain(<tokens:base/s:t />, {254: spanExpansion(tokens:s:der, []{1, 4}, right)}))");
+ "focus(254: spanContain(<tokens:base/s:t />, {254: spanExpansion(tokens:s:der, []{4, 4}, right)}))");
};
diff --git a/src/test/java/de/ids_mannheim/korap/query/TestSpanSequenceQueryJSON.java b/src/test/java/de/ids_mannheim/korap/query/TestSpanSequenceQueryJSON.java
index 97a9eb2..aff5cf8 100644
--- a/src/test/java/de/ids_mannheim/korap/query/TestSpanSequenceQueryJSON.java
+++ b/src/test/java/de/ids_mannheim/korap/query/TestSpanSequenceQueryJSON.java
@@ -373,8 +373,8 @@
// Sonne [] Mond?
SpanQueryWrapper sqwi = jsonQueryFile("empty-followed-by-optionality.jsonld");
assertEquals(
- "focus(254: spanContain(<tokens:base/s:t />, {254: spanOr([spanExpansion(tokens:s:Sonne, []{1, 1}, right), spanNext(spanExpansion(tokens:s:Sonne, []{1, 1}, right), tokens:s:Mond)])}))",
- sqwi.toQuery().toString());
+ "focus(254: spanContain(<tokens:base/s:t />, {254: spanOr([spanExpansion(tokens:s:Sonne, []{1, 1}, right), spanNext(spanExpansion(tokens:s:Sonne, []{1, 1}, right), tokens:s:Mond)])}))",
+ sqwi.toQuery().toString());
// Could also be a distance at the end ... that's a query planner thing.
};