Merge empty tokens to larger extensions

Change-Id: I4f1248fc2651f676dbc635b09e6c1bd29d0f6953
diff --git a/Changes b/Changes
index 6c9d48f..76b694d 100644
--- a/Changes
+++ b/Changes
@@ -5,6 +5,9 @@
           query planner (diewald)
         - [performance] Treat term queries like ".+?", ".+", ".*?", and ".*"
           as any-terms (diewald)
+        - [feature] Introduced SpanExpansionQueryWrapper (diewald)
+        - [performance] Sequences of empty tokens will now be merged into
+          a single extension, if possible (diewald)
 
 0.55.6 2016-08-10
         - [bugfix] distance with key "t" uses default foundry (diewald)
diff --git a/src/main/java/de/ids_mannheim/korap/query/QueryBuilder.java b/src/main/java/de/ids_mannheim/korap/query/QueryBuilder.java
index 72cfb23..96e4eb5 100644
--- a/src/main/java/de/ids_mannheim/korap/query/QueryBuilder.java
+++ b/src/main/java/de/ids_mannheim/korap/query/QueryBuilder.java
@@ -257,7 +257,7 @@
         return ssq;
     };
 
-
+	
     /**
      * Create an empty query segment.
      * 
diff --git a/src/main/java/de/ids_mannheim/korap/query/wrap/SpanAlterQueryWrapper.java b/src/main/java/de/ids_mannheim/korap/query/wrap/SpanAlterQueryWrapper.java
index a7b55cf..8fd18cd 100644
--- a/src/main/java/de/ids_mannheim/korap/query/wrap/SpanAlterQueryWrapper.java
+++ b/src/main/java/de/ids_mannheim/korap/query/wrap/SpanAlterQueryWrapper.java
@@ -83,7 +83,6 @@
         if (term.maybeUnsorted())
             this.maybeUnsorted = true;
 
-
         this.isNull = false;
         return this;
     };
diff --git a/src/main/java/de/ids_mannheim/korap/query/wrap/SpanExpansionQueryWrapper.java b/src/main/java/de/ids_mannheim/korap/query/wrap/SpanExpansionQueryWrapper.java
new file mode 100644
index 0000000..7c4cf61
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/query/wrap/SpanExpansionQueryWrapper.java
@@ -0,0 +1,68 @@
+package de.ids_mannheim.korap.query.wrap;
+
+import org.apache.lucene.search.RegexpQuery;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.util.automaton.RegExp;
+import org.apache.lucene.index.Term;
+import de.ids_mannheim.korap.query.wrap.SpanQueryWrapper;
+import de.ids_mannheim.korap.query.SpanExpansionQuery;
+import de.ids_mannheim.korap.util.QueryException;
+
+
+import java.util.*;
+
+/*
+ * TODO: SpanExpansionQueryWrapper currently does not support negative extensions!
+ */
+/**
+ * @author diewald
+ */
+
+public class SpanExpansionQueryWrapper extends SpanQueryWrapper {
+    private SpanQueryWrapper anchor;
+
+	// < 0 	to the left of anchor span 
+    // >= 0  to the right of anchor span
+	private int direction;
+
+	// if > 0, collect expansion offsets
+	// using this label
+    private byte classNumber;
+
+
+    public SpanExpansionQueryWrapper (SpanQueryWrapper anchor,
+									  int min,
+									  int max,
+									  int direction,
+									  byte classNumber
+		) {
+		this.anchor = anchor;
+		this.isNull = false;
+		this.min = min;
+		this.max = max;
+		this.direction = direction;
+		this.classNumber = classNumber;
+		this.isExtended = true;
+		if (direction >= 0)
+			this.isExtendedToTheRight = true;
+    };
+
+	@Override
+    public boolean isNull () {
+		// Needs to be overwritten, as min and max do not indicate null value
+        return this.isNull;
+    };
+	
+
+	@Override
+    public SpanQuery toFragmentQuery () throws QueryException {
+		return new SpanExpansionQuery(
+			this.anchor.retrieveNode(this.retrieveNode).toFragmentQuery(),
+			this.getMin(),
+			this.getMax(),
+			this.direction,
+			this.classNumber,
+			true
+			);
+    };
+};
diff --git a/src/main/java/de/ids_mannheim/korap/query/wrap/SpanQueryWrapper.java b/src/main/java/de/ids_mannheim/korap/query/wrap/SpanQueryWrapper.java
index f31006d..f0221d5 100644
--- a/src/main/java/de/ids_mannheim/korap/query/wrap/SpanQueryWrapper.java
+++ b/src/main/java/de/ids_mannheim/korap/query/wrap/SpanQueryWrapper.java
@@ -95,6 +95,13 @@
         return this.isOptional;
     };
 
+	
+	public SpanQueryWrapper isOptional (boolean opt) {
+        this.isOptional = opt;
+        return this;
+    };
+
+
 
     /**
      * Boolean value indicating that the wrapped query is
@@ -212,7 +219,6 @@
         return this;
     };
 
-
     /**
      * Check, if the wrapped query can be used as an
      * anchor query in a sequence, i.e. a query that
@@ -413,8 +419,10 @@
      */
     public String toString () {
         String string = "" + (this.isNull() ? "isNull" : "notNull") + "-"
-                + (this.isEmpty() ? "isEmpty" : "notEmpty") + "-"
-                + (this.isOptional() ? "isOptional" : "notOptional");
+			+ (this.isEmpty() ? "isEmpty" : "notEmpty") + "-"
+			+ (this.isOptional() ? "isOptional" : "notOptional") + "-"
+			+ (this.isExtendedToTheRight() ? "isExtendedToTheRight" : "notExtendedToTheRight");
+		;
         return string;
     };
 };
diff --git a/src/main/java/de/ids_mannheim/korap/query/wrap/SpanSequenceQueryWrapper.java b/src/main/java/de/ids_mannheim/korap/query/wrap/SpanSequenceQueryWrapper.java
index f100853..c5aac76 100644
--- a/src/main/java/de/ids_mannheim/korap/query/wrap/SpanSequenceQueryWrapper.java
+++ b/src/main/java/de/ids_mannheim/korap/query/wrap/SpanSequenceQueryWrapper.java
@@ -18,7 +18,6 @@
 
 /*
   TODO: Make isNegative work!
-  TODO: Make isEmpty work!
   TODO: Evaluate if spanNext(spanNext(a,b),spanNext(c,d)) is faster
         than spanNext(spanNext(spanNext(a,b),c),d)
   TODO: Improve support for SpanElementQueryWrapper in constraints!
@@ -728,7 +727,7 @@
         if (query == null)
             return (SpanQuery) null;
 
-        // NextQueries:
+        // NextQueries
         if (!this.hasConstraints() && this.isInOrder()) {
             for (; i < this.segments.size(); i++) {
 
@@ -742,6 +741,7 @@
             return (SpanQuery) query;
         };
 
+		// DistanceQueries with problems
         if (this.hasConstraints() && this.isProblematic) {
             throw new QueryException(613,
                     "Distance constraints not supported with empty, optional or negative operands");
@@ -960,7 +960,7 @@
     };
 
 
-    // Todo: Deal with negative and optional!
+    // Todo: Deal with negative, empty and optional!
     // [base=der][base!=Baum]?
     private SpanQueryWrapper _merge (SpanQueryWrapper anchor,
             SpanQueryWrapper problem, boolean mergeLeft) throws QueryException {
@@ -980,15 +980,46 @@
                 log.trace("Problem is empty with class {}",
                         problem.getClassNumber());
 
-            query = new SpanExpansionQuery(anchor.retrieveNode(
-                    this.retrieveNode).toFragmentQuery(),
-                    problem.isOptional() ? 0 : problem.getMin(),
-                    problem.getMax(), direction,
-                    problem.hasClass() ? problem.getClassNumber() : (byte) 0,
-                    true);
-            SpanQueryWrapper sqw = new SpanSimpleQueryWrapper(query)
-                    .isExtended(true);
+			// Merge extensions!
+			if (!problem.hasClass() &&
+				!anchor.hasClass() &&
+				anchor.isExtended()) {
 
+				if (DEBUG)
+					log.trace("It may be possible to extend anchor with problem");
+				
+				if (
+					// Further extend to the right ...
+					(direction >= 0 && anchor.isExtendedToTheRight) ||
+
+					// or the left
+					(direction < 0 && !anchor.isExtendedToTheRight)) {
+
+					if (DEBUG)
+						log.trace("Readjust min and max");
+
+					// Readjust the anchor
+					anchor.setMin(anchor.getMin() + problem.getMin());
+					anchor.setMax(anchor.getMax() + problem.getMax());
+
+					/*
+					 * This is wrong - min is only relevant for extensions
+					if (anchor.getMin() > 0)
+						anchor.isOptional = false;
+					*/					
+					return anchor;
+				};
+			};
+
+			// Can't merge extensions
+			SpanQueryWrapper sqw = new SpanExpansionQueryWrapper(
+				anchor,
+				problem.isOptional() ? 0 : problem.getMin(),
+				problem.getMax(),
+				direction,
+				problem.hasClass() ? problem.getClassNumber() : (byte) 0
+				).isExtended(true);
+				
             // Set right extension
             if (direction >= 0)
                 sqw.isExtendedToTheRight(true);
@@ -1005,15 +1036,17 @@
                 log.trace("Problem is negative with class {}",
                         problem.getClassNumber());
 
-            query = new SpanExpansionQuery(anchor.retrieveNode(
-                    this.retrieveNode).toFragmentQuery(), problem.retrieveNode(
-                    this.retrieveNode).toFragmentQuery(), problem.getMin(),
-                    problem.getMax(), direction,
-                    problem.hasClass() ? problem.getClassNumber() : (byte) 0,
-                    true);
+			// TODO: Should probably wrapped as well!
+			// A sequence of negative tokens may expand jointly!
+			query = new SpanExpansionQuery(anchor.retrieveNode(
+											   this.retrieveNode).toFragmentQuery(), problem.retrieveNode(
+												   this.retrieveNode).toFragmentQuery(), problem.getMin(),
+										   problem.getMax(), direction,
+										   problem.hasClass() ? problem.getClassNumber() : (byte) 0,
+										   true);
 
             SpanQueryWrapper sqw = new SpanSimpleQueryWrapper(query)
-                    .isExtended(true);
+				.isExtended(true);
 
             // Set right extension
             if (direction >= 0)
@@ -1025,24 +1058,29 @@
         if (DEBUG)
             log.trace("Problem is optional");
 
+		// [base=der][][base=Baum]?
         // [base=der][base=baum]?
 
         // [base=der]
-        SpanAlterQueryWrapper saqw = new SpanAlterQueryWrapper(this.field,
-                anchor);
+        SpanAlterQueryWrapper saqw = new SpanAlterQueryWrapper(
+			this.field,
+			anchor
+			);
 
         // [base=der]
         SpanSequenceQueryWrapper ssqw = new SpanSequenceQueryWrapper(
-                this.field, anchor);
+			this.field,
+			anchor
+			);
 
         // [base=der][base=baum]
-        if (mergeLeft)
-            ssqw.append(new SpanSimpleQueryWrapper(problem.retrieveNode(
-                    this.retrieveNode).toFragmentQuery()));
+	    if (mergeLeft) {
+            ssqw.append(problem.isOptional(false));
+		}
         // [base=baum][base=der]
-        else
-            ssqw.prepend(new SpanSimpleQueryWrapper(problem.retrieveNode(
-                    this.retrieveNode).toFragmentQuery()));
+        else {
+            ssqw.prepend(problem.isOptional(false));
+		}
 
         saqw.or(ssqw);
 
diff --git a/src/main/java/de/ids_mannheim/korap/query/wrap/SpanSimpleQueryWrapper.java b/src/main/java/de/ids_mannheim/korap/query/wrap/SpanSimpleQueryWrapper.java
index 61608e5..3d9c76a 100644
--- a/src/main/java/de/ids_mannheim/korap/query/wrap/SpanSimpleQueryWrapper.java
+++ b/src/main/java/de/ids_mannheim/korap/query/wrap/SpanSimpleQueryWrapper.java
@@ -3,6 +3,7 @@
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.spans.SpanQuery;
 import org.apache.lucene.search.spans.SpanTermQuery;
+import de.ids_mannheim.korap.util.QueryException;
 
 public class SpanSimpleQueryWrapper extends SpanQueryWrapper {
     private SpanQuery query;
@@ -25,6 +26,20 @@
         this.query = query;
     };
 
+	// This is similar to a clone
+	public SpanSimpleQueryWrapper (SpanQueryWrapper query) throws QueryException {
+		this.hasClass = query.hasClass();
+		this.isOptional = query.isOptional();
+		this.isNegative = query.isNegative();
+		this.isEmpty = query.isEmpty();
+		this.isExtended = query.isExtended();
+		this.isExtendedToTheRight = query.isExtendedToTheRight();
+		this.maybeUnsorted = query.maybeUnsorted();
+		this.retrieveNode = query.retrieveNode;
+		this.query = query.toFragmentQuery();
+		this.isNull = query.isNull();
+    };
+
 
     @Override
     public SpanQuery toFragmentQuery () {
diff --git a/src/test/java/de/ids_mannheim/korap/query/TestKrillQueryJSON.java b/src/test/java/de/ids_mannheim/korap/query/TestKrillQueryJSON.java
index 09d690b..27f00f3 100644
--- a/src/test/java/de/ids_mannheim/korap/query/TestKrillQueryJSON.java
+++ b/src/test/java/de/ids_mannheim/korap/query/TestKrillQueryJSON.java
@@ -601,16 +601,16 @@
 			"focus(254: spanContain(<tokens:base/s:t />, {254: spanExpansion(tokens:s:der, []{1, 1}, right)}))");
     };
 
-	@Ignore
+	@Test
     public void queryJSONregexRewrite2 () throws QueryException {
-        // "der" [.+?]
+        // "der" [.*] [.*?] [.+] [.+?]
 		String json = getString(getClass().getResource(
 									"/queries/sequence/regex-rewrite-2.jsonld").getFile());
 		KrillQuery kq = new KrillQuery("tokens");
 
 		assertEquals(
 			kq.fromKoral(json).toQuery().toString(),
-			"focus(254: spanContain(<tokens:base/s:t />, {254: spanExpansion(tokens:s:der, []{1, 4}, right)}))");
+			"focus(254: spanContain(<tokens:base/s:t />, {254: spanExpansion(tokens:s:der, []{4, 4}, right)}))");
     };
 
 
diff --git a/src/test/java/de/ids_mannheim/korap/query/TestSpanSequenceQueryJSON.java b/src/test/java/de/ids_mannheim/korap/query/TestSpanSequenceQueryJSON.java
index 97a9eb2..aff5cf8 100644
--- a/src/test/java/de/ids_mannheim/korap/query/TestSpanSequenceQueryJSON.java
+++ b/src/test/java/de/ids_mannheim/korap/query/TestSpanSequenceQueryJSON.java
@@ -373,8 +373,8 @@
         // Sonne [] Mond?
         SpanQueryWrapper sqwi = jsonQueryFile("empty-followed-by-optionality.jsonld");
         assertEquals(
-                "focus(254: spanContain(<tokens:base/s:t />, {254: spanOr([spanExpansion(tokens:s:Sonne, []{1, 1}, right), spanNext(spanExpansion(tokens:s:Sonne, []{1, 1}, right), tokens:s:Mond)])}))",
-                sqwi.toQuery().toString());
+			"focus(254: spanContain(<tokens:base/s:t />, {254: spanOr([spanExpansion(tokens:s:Sonne, []{1, 1}, right), spanNext(spanExpansion(tokens:s:Sonne, []{1, 1}, right), tokens:s:Mond)])}))",
+			sqwi.toQuery().toString());
         // Could also be a distance at the end ... that's a query planner thing.
     };