SpanSegmentQuery. Abstract classes for SpanSegmentQuery and SpanNextQuery.

commit: ed3bb3b392f5e1cd71645c9b2c566374c2a94888 [log] [tgz]
author: Eliza Margaretha <margaretha@ids-mannheim.de> Tue Jan 14 10:53:56 2014 +0000
committer: Eliza Margaretha <margaretha@ids-mannheim.de> Tue Jan 14 10:53:56 2014 +0000
tree: e5b7fce7618f97ddf73943b79a2a250e77b21373
parent: 138e5b9fff6f90cdc0e9427e0a2ebeae13765890 [diff]
diff --git a/src/main/java/de/ids_mannheim/korap/query/SimpleSpanQuery.java b/src/main/java/de/ids_mannheim/korap/query/SimpleSpanQuery.java
new file mode 100644
index 0000000..f0d8589
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/query/SimpleSpanQuery.java

@@ -0,0 +1,105 @@
+package de.ids_mannheim.korap.query;
+
+import java.io.IOException;
+import java.util.Set;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.util.ToStringUtils;
+
+/** An abstract base class for span queries that combine exactly two clauses
+ * 	of the same field.
+ * 
+ * 	@author margaretha
+ * */
+public abstract class SimpleSpanQuery extends SpanQuery implements Cloneable{
+
+	private SpanQuery firstClause, secondClause;
+	private String field;
+	private String spanName;
+
+	/** Creates a query over two clauses that must share one field.
+	 * 	@param spanName the label printed in front of the clauses by toString
+	 * 	@throws IllegalArgumentException if the clause fields differ
+	 * */
+	public SimpleSpanQuery(SpanQuery firstClause, SpanQuery secondClause, String spanName) {
+		this.field = secondClause.getField();
+		if (!firstClause.getField().equals(field)){
+			throw new IllegalArgumentException("Clauses must have the same field.");
+		}
+		this.setFirstClause(firstClause);
+		this.setSecondClause(secondClause);
+		this.spanName=spanName;
+	}
+
+	@Override
+	public String getField() {
+		return field;
+	}
+
+	/** Renders the query as spanName(first, second) followed by the boost. */
+	@Override
+	public String toString(String field) {
+		StringBuilder sb = new StringBuilder();
+		sb.append(this.spanName);
+		sb.append("(");
+		sb.append(firstClause.toString(field));
+		sb.append(", ");
+		sb.append(secondClause.toString(field));
+		sb.append(")");
+		sb.append(ToStringUtils.boost(getBoost()));
+		return sb.toString();
+	}
+
+	public SpanQuery getFirstClause() {
+		return firstClause;
+	}
+
+	public void setFirstClause(SpanQuery firstClause) {
+		this.firstClause = firstClause;
+	}
+
+	public SpanQuery getSecondClause() {
+		return secondClause;
+	}
+
+	public void setSecondClause(SpanQuery secondClause) {
+		this.secondClause = secondClause;
+	}
+
+	/** Collects the terms of both clauses into the given set. */
+	@Override
+	public void extractTerms(Set<Term> terms) {
+		firstClause.extractTerms(terms);
+		secondClause.extractTerms(terms);
+	}
+
+	/** Rewrites both clauses, e.g. to expand wildcard or regex clauses.
+	 * 	A clone is only created when a clause's rewrite returns another
+	 * 	object than the clause itself; if both clauses return themselves,
+	 * 	this instance is returned, which signals that rewriting is done.
+	 * */
+	@Override
+	public Query rewrite(IndexReader reader) throws IOException {
+		SimpleSpanQuery clone = null;
+		SpanQuery query = (SpanQuery) firstClause.rewrite(reader);
+		if (query != firstClause) {
+			clone = clone();
+			clone.firstClause = query;
+		}
+		query = (SpanQuery) secondClause.rewrite(reader);
+		if (query != secondClause) {
+			if (clone == null) {
+				clone = clone();
+			}
+			clone.secondClause = query;
+		}
+		return (clone != null ? clone : this);
+	}
+
+	/** Returns a copy of this query; rewrite() changes the copy only. */
+	public abstract SimpleSpanQuery clone();
+
+}

diff --git a/src/main/java/de/ids_mannheim/korap/query/SpanNextQuery.java b/src/main/java/de/ids_mannheim/korap/query/SpanNextQuery.java
index 0fd90e8..32a90d9 100644
--- a/src/main/java/de/ids_mannheim/korap/query/SpanNextQuery.java
+++ b/src/main/java/de/ids_mannheim/korap/query/SpanNextQuery.java

@@ -30,99 +30,35 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-/** Matches spans which are directly next to each other.
+/** Matches spans which are directly next to each other. 
+ * 	This is identical to a phrase query with exactly two clauses. 
  */
-public class SpanNextQuery extends SpanQuery implements Cloneable {
-    private SpanQuery firstClause, secondClause;
-    public String field;
+public class SpanNextQuery extends SimpleSpanQuery implements Cloneable {
+    private SpanQuery firstClause;
+    private SpanQuery secondClause;    
     private boolean collectPayloads;
 
-    // Logger
-    private final static Logger log = LoggerFactory.getLogger(SpanNextQuery.class);
-
     // Constructor
     public SpanNextQuery(SpanQuery firstClause, SpanQuery secondClause) {
-	this(firstClause, secondClause, true);
+    	this(firstClause, secondClause, true);
     };
 
     // Constructor  
-    public SpanNextQuery(SpanQuery firstClause,
-			 SpanQuery secondClause,
-			 boolean collectPayloads) {
-
-	this.field = secondClause.getField();
-	if (!firstClause.getField().equals(field)) {
-	    throw new IllegalArgumentException("Clauses must have same field");
-	};
-
-	this.collectPayloads = collectPayloads;
-	this.firstClause = firstClause;
-	this.secondClause = secondClause;
+    public SpanNextQuery(SpanQuery firstClause, SpanQuery secondClause,
+		boolean collectPayloads) {   
+    	super(firstClause, secondClause, "spanNext");
+		this.collectPayloads = collectPayloads;
+		this.firstClause = firstClause;
+		this.secondClause = secondClause;
     };
 
 
     @Override
-    public String getField() { return field; }
-
-    public SpanQuery firstClause() { return firstClause; };
-
-    public SpanQuery secondClause() { return secondClause; };
-  
-    @Override
-    public void extractTerms(Set<Term> terms) {
-	firstClause.extractTerms(terms);
-	secondClause.extractTerms(terms);
+    public Spans getSpans (final AtomicReaderContext context, Bits acceptDocs,
+		   Map<Term,TermContext> termContexts) throws IOException {	
+		return (Spans) new NextSpans (this, context, acceptDocs, 
+				termContexts, collectPayloads);
     };
-  
-
-    @Override
-    public String toString(String field) {
-	StringBuilder sb = new StringBuilder();
-	sb.append("spanNext(")
-          .append(firstClause.toString(field))
-          .append(", ")
-          .append(secondClause.toString(field))
-          .append(")")
-          .append(ToStringUtils.boost(getBoost()));
-	return sb.toString();
-    };
-
-    @Override
-    public Spans getSpans (final AtomicReaderContext context,
-			   Bits acceptDocs,
-			   Map<Term,TermContext> termContexts) throws IOException {
-
-	log.trace("Get Spans");
-	return (Spans) new NextSpans (
-	    this, context, acceptDocs, termContexts, collectPayloads
-	);
-    };
-
-    @Override
-    public Query rewrite (IndexReader reader) throws IOException {
-	SpanNextQuery clone = null;
-
-	SpanQuery query = (SpanQuery) firstClause.rewrite(reader);
-
-	if (query != firstClause) {
-	    if (clone == null)
-		clone = this.clone();
-	    clone.firstClause = query;
-	};
-
-	query = (SpanQuery) secondClause.rewrite(reader);
-	if (query != secondClause) {
-	    if (clone == null)
-		clone = this.clone();
-	    clone.secondClause = query;
-	};
-
-	if (clone != null)
-	    return clone;
-
-	return this;
-    };
-  
 
     @Override
     public SpanNextQuery clone() {
@@ -145,7 +81,7 @@
 	final SpanNextQuery spanNextQuery = (SpanNextQuery) o;
 	
 	if (collectPayloads != spanNextQuery.collectPayloads) return false;
-	if (!firstClause.equals(spanNextQuery.firstClause))   return false;
+	if (!firstClause.equals(spanNextQuery.firstClause)) return false;
 	if (!secondClause.equals(spanNextQuery.secondClause)) return false;
 
 	return getBoost() == spanNextQuery.getBoost();

diff --git a/src/main/java/de/ids_mannheim/korap/query/SpanSegmentQuery.java b/src/main/java/de/ids_mannheim/korap/query/SpanSegmentQuery.java
new file mode 100644
index 0000000..3e52cbf
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/query/SpanSegmentQuery.java

@@ -0,0 +1,85 @@
+package de.ids_mannheim.korap.query;
+
+import java.io.IOException;
+import java.util.Map;
+
+import org.apache.lucene.index.AtomicReaderContext;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermContext;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.Spans;
+import org.apache.lucene.util.Bits;
+
+import de.ids_mannheim.korap.query.spans.SegmentSpans;
+
+/** A span query that matches where a span of the first clause and a span
+ * 	of the second clause have exactly the same start and end position.
+ * 	The matching itself is done by {@link SegmentSpans}.
+ * */
+public class SpanSegmentQuery extends SimpleSpanQuery{
+
+	private boolean collectPayloads;
+
+	/** Creates a segment query that collects payloads. */
+	public SpanSegmentQuery(SpanQuery firstClause, SpanQuery secondClause) {
+		this(firstClause,secondClause,true);
+	}
+
+	/** Creates a segment query. The clauses are kept by the superclass only
+	 * 	(no private copies here), so a clause replaced by rewrite() is also
+	 * 	the one used by clone(), equals() and hashCode().
+	 * 	@param collectPayloads handed over to {@link SegmentSpans}
+	 * */
+	public SpanSegmentQuery(SpanQuery firstClause, SpanQuery secondClause,
+			boolean collectPayloads) {
+		super(firstClause,secondClause,"spanSegment");
+		this.collectPayloads = collectPayloads;
+	}
+
+	@Override
+	public Spans getSpans(AtomicReaderContext context, Bits acceptDocs,
+			Map<Term, TermContext> termContexts) throws IOException {
+		return (Spans) new SegmentSpans(this, context, acceptDocs,
+				termContexts, collectPayloads);
+	}
+
+	/** Returns a copy built from clones of the current clauses, with the
+	 * 	same payload setting and boost. */
+	@Override
+	public SimpleSpanQuery clone() {
+		SpanSegmentQuery spanSegmentQuery = new SpanSegmentQuery(
+				(SpanQuery) getFirstClause().clone(),
+				(SpanQuery) getSecondClause().clone(),
+				this.collectPayloads
+				);
+		spanSegmentQuery.setBoost(getBoost());
+		return spanSegmentQuery;
+	}
+
+	/** Two segment queries are equal iff their clauses, their payload
+	 * 	setting and their boost are equal. */
+	@Override
+	public boolean equals(Object o) {
+		if (this == o) return true;
+		if (!(o instanceof SpanSegmentQuery)) return false;
+
+		final SpanSegmentQuery other = (SpanSegmentQuery) o;
+
+		if (collectPayloads != other.collectPayloads) return false;
+		if (!getFirstClause().equals(other.getFirstClause())) return false;
+		if (!getSecondClause().equals(other.getSecondClause())) return false;
+
+		return Float.floatToIntBits(getBoost())
+				== Float.floatToIntBits(other.getBoost());
+	}
+
+	/** Hash code over the same values that {@link #equals(Object)} compares. */
+	@Override
+	public int hashCode() {
+		int result = getFirstClause().hashCode();
+		result = 31 * result + getSecondClause().hashCode();
+		result = 31 * result + (collectPayloads ? 1 : 0);
+		result = 31 * result + Float.floatToIntBits(getBoost());
+		return result;
+	}
+}

diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/NextSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/NextSpans.java
index b6080fe..e594c7f 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/NextSpans.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/NextSpans.java

@@ -1,408 +1,52 @@
 package de.ids_mannheim.korap.query.spans;
 
-/* Inspired by NearSpansOrdered
- *
- * REIMPLEMENTATION
- *
- */
+import java.io.IOException;
+
+import java.util.Map;
 
 import org.apache.lucene.index.AtomicReaderContext;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermContext;
-import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.Bits;
-import org.apache.lucene.search.spans.Spans;
-import org.apache.lucene.search.spans.SpanQuery;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Comparator;
-import java.util.HashSet;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Collection;
-import java.util.Map;
-import java.util.Set;
-
-import de.ids_mannheim.korap.query.SpanNextQuery;
-
-// Todo: Disable the option to discard payloads
-
-import java.util.*;
-import java.io.*;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/** From Spans.java:
- * Expert: an enumeration of span matches.  Used to implement span searching.
- * Each span represents a range of term positions within a document.  Matches
- * are enumerated in order, by increasing document number, within that by
- * increasing start position and finally by increasing end position. */
-public class NextSpans extends Spans {
-    private boolean firstTime = true;
-    private boolean more = false;
-
-    // Initialize as invalid
-    private int matchDoc   = -1;
-    private int matchStart = -1;
-    private int matchEnd   = -1;
-
-    /** Indicates that all both spans have the same doc() */
-    private boolean inSameDoc = false;
-
-    // First span
-    private final Spans firstSpans;
-    private final Spans firstSpansByDoc;
-
-    // Second span
-    private final Spans secondSpans;
-    private final Spans secondSpansByDoc;
-
-    private SpanNextQuery query;
-
-    private List<byte[]> matchPayload;
-    private boolean collectPayloads = true;
-
-    private final static Logger log = LoggerFactory.getLogger(NextSpans.class);
-
-    // Constructor
-    public NextSpans (SpanNextQuery spanNextQuery,
-		      AtomicReaderContext context,
-		      Bits acceptDocs,
-		      Map<Term,TermContext> termContexts) throws IOException {
-	this(spanNextQuery, context, acceptDocs, termContexts, true);
-    };
-
-    // Constructor
-    public NextSpans (SpanNextQuery spanNextQuery,
-		      AtomicReaderContext context,
-		      Bits acceptDocs,
-		      Map<Term,TermContext> termContexts,
-		      boolean collectPayloads) throws IOException {
-
-	log.trace("Init NextSpans");
-
-	//	this.collectPayloads = collectPayloads;
-
-	// Init copies
-	matchPayload = new LinkedList<byte[]>();
-
-	firstSpans = spanNextQuery.firstClause().getSpans(
-	    context, acceptDocs, termContexts
-        );
-	firstSpansByDoc = firstSpans; // used in toSameDoc()
-
-	secondSpans = spanNextQuery.secondClause().getSpans(
-            context, acceptDocs, termContexts
-        );
-	secondSpansByDoc = secondSpans; // used in toSameDoc()
-
-	/*
-	if (DEBUG) {
-	    System.err.println("***");
-	    while (subSpans[i].next()) {
-		StringBuffer payloadString = new StringBuffer();
-		int docid = subSpans[i].doc();
-		System.err.println("Span: "+i+" Doc: " + docid + " with " + subSpans[i].start() + "-" + subSpans[i].end() + " || " + payloadString.toString());
-	    };
-	};
-	*/
-	query = spanNextQuery; // kept for toString() only.
-    };
 
 
-    /** Move to the next match, returning true iff any such exists. */
-    @Override
-    public boolean next () throws IOException {
-	log.trace("Next with doc {}", matchDoc);
+import de.ids_mannheim.korap.query.SimpleSpanQuery;
 
-	// Check for init next
-	if (firstTime) {
-	    log.trace("First retrieval of NextSpans");
-	    firstTime = false;
-	    if (!firstSpans.next() || !secondSpans.next()) {
-		log.trace("No next in firstSpan nor in secondSpan");
-		more = false;
-		return false;
-	    };
-	    log.trace("Spans are initialized");
-	    more = true;
-	};
-
-	//	if (collectPayloads)
-	    matchPayload.clear();
-
-	return advance();
-    };
-
-
-    /** Skips to the first match beyond the current, whose document number is
-     * greater than or equal to <i>target</i>. <p>Returns true iff there is such
-     * a match.  <p>Behaves as if written: <pre class="prettyprint">
-     *   boolean skipTo(int target) {
-     *     do {
-     *       if (!next())
-     *         return false;
-     *     } while (target > doc());
-     *     return true;
-     *   }
-     * </pre>
-     * Most implementations are considerably more efficient than that.
-     */
-    public boolean skipTo (int target) throws IOException {
-	log.trace("skipTo {}", target);
-
-	// Check for init next
-	if (firstTime) {
-	    firstTime = false;
-	    if (!firstSpans.next() && !secondSpans.next()) {
-		more = false;
-		return false;
-	    };
-	    more = true;
-	}
-
-	// There are more spans, but the doc has to be skipped to target
-	// Warning: This only skips firstSpans!
-	//          Maybe that's wrong ...
-	else if (more && (firstSpans.doc() < target)) {
-	    if (firstSpans.skipTo(target)) {
-		inSameDoc = false;
-	    }
-
-	    else {
-		more = false;
-		return false;
-	    };
-	};
-
-	//	if (collectPayloads)
-	    matchPayload.clear();
-
-	return advance();
-    };
-
-
-    /** Advance the subSpans to the same document */
-    private boolean toSameDoc() throws IOException {
-	log.trace("toSameDoc");
-
-	if (firstSpansByDoc.doc() < secondSpansByDoc.doc()) {
-	    if (!firstSpansByDoc.skipTo(secondSpansByDoc.doc())) {
-		more = false;
-		inSameDoc = false;
-		return false;
-	    };
-	}
-	else if (firstSpansByDoc.doc() > secondSpansByDoc.doc()) {
-	    if (!secondSpansByDoc.skipTo( firstSpansByDoc.doc() )) {
-		more = false;
-		inSameDoc = false;
-		return false;
-	    };
-	};
-	inSameDoc = true;
-	return true;
-    };
-
-
-    /** Advances the subSpans to just after an ordered match with a minimum slop
-     * that is smaller than the slop allowed by the SpanNearQuery.
-     * @return true iff there is such a match.
-     */
-    private boolean advance() throws IOException {
-	log.trace("advance");
-	boolean match = false;
-
-	// There are more spans, and both spans are either in the
-	// same doc or can be forwarded to the same doc.
-	while (more && (inSameDoc || toSameDoc())) {
-
-	    log.trace("More spans in the same Doc: {}", firstSpansByDoc.doc());
-	    
-	    /* spans are in the same doc and in the correct order next to each other */
-	    if (match()) {
-
-		// start and end position of last span
-		matchStart = firstSpans.start();
-		matchEnd = secondSpans.end();
-
-		log.trace("Matching: {}-{}", matchStart, matchEnd);
-
-		log.trace("Check for payloads");
-
-
-		//		if (collectPayloads) {
-		    log.trace("copy payloads");
-
-		    if (firstSpans.isPayloadAvailable()) {
-			Collection<byte[]> payload = firstSpans.getPayload();
-			log.trace("Found {} payloads in firstSpans", payload.size());
-			matchPayload.addAll(payload);
-		    };
-		    if (secondSpans.isPayloadAvailable()) {
-			Collection<byte[]> payload = secondSpans.getPayload();
-			log.trace("Found {} payloads in secondSpans", payload.size());
-			matchPayload.addAll(payload);
-		    };
-		    //		};
-
-		log.trace("=> MATCH");
-		match = true;
-		break;
-	    };
-	};
-
-	log.trace("Forward secondSpans");
-	if (!secondSpans.next()) {
-	    log.trace("No more secondSpans");
-	    more = false;
-	};
-	inSameDoc = false;
-	return match;
-    };
-
-
-    /** Returns the document number of the current match.  Initially invalid. */
-    @Override
-    public int doc () {
-	return matchDoc;
-    };
-
-    /** Returns the start position of the current match.  Initially invalid. */
-    @Override
-    public int start () {
-	return matchStart;
-    };
-
-    /** Returns the end position of the current match.  Initially invalid. */
-    @Override
-    public int end () {
-	return matchEnd;
-    };
-
-    /**
-     * Returns the payload data for the current span.
-     * This is invalid until {@link #next()} is called for
-     * the first time.
-     * This method must not be called more than once after each call
-     * of {@link #next()}. However, most payloads are loaded lazily,
-     * so if the payload data for the current position is not needed,
-     * this method may not be called at all for performance reasons. An ordered
-     * SpanQuery does not lazy load, so if you have payloads in your index and
-     * you do not want ordered SpanNearQuerys to collect payloads, you can
-     * disable collection with a constructor option.<br>
-     * <br>
-     * Note that the return type is a collection, thus the ordering should not be relied upon.
-     * <br/>
-     * @lucene.experimental
-     *
-     * @return a List of byte arrays containing the data of this payload, otherwise null if isPayloadAvailable is false
-     * @throws IOException if there is a low-level I/O error
-     */
-    // public abstract Collection<byte[]> getPayload() throws IOException;
-    @Override
-    public Collection<byte[]> getPayload() throws IOException {
-	log.trace("Payload is requested with payload count {}", matchPayload.size());
-	return matchPayload;
-    };
+/**	NextSpans is an enumeration of Span matches, which ensures that  
+ * 	a span is immediately followed by another span. 
+ * 
+ * 	@author margaretha 
+ * */
+public class NextSpans extends SimpleSpans {	
+	
+    public NextSpans (SimpleSpanQuery simpleSpanQuery,
+  	      AtomicReaderContext context,
+  	      Bits acceptDocs,
+  	      Map<Term,TermContext> termContexts) throws IOException {
+    	this(simpleSpanQuery, context, acceptDocs, termContexts, true);    	
+    }
     
-
-    /**
-     * Checks if a payload can be loaded at this position.
-     * <p/>
-     * Payloads can only be loaded once per call to
-     * {@link #next()}.
-     *
-     * @return true if there is a payload available at this position that can be loaded
-     */
-    @Override
-    public boolean isPayloadAvailable() {
-	log.trace("Check for payload emptyness: {}", matchPayload.isEmpty());
-
-	return matchPayload.isEmpty() == false;
-    };
-
-
-    // Todo: This may be in the wrong version
-    @Override
-    public long cost() {
-	return Math.min(firstSpans.cost(), secondSpans.cost());
-    };
-
-
-    @Override
-    public String toString() {
-	return getClass().getName() + "("+query.toString()+")@"+
-	    (firstTime?"START":(more?(doc()+":"+start()+"-"+end()):"END"));
-    };
-
-
-    public boolean match () throws IOException {
-	matchDoc = firstSpans.doc();
-	log.trace("Check for next match");
-
-	byte check;
-	while (inSameDoc && ((check = docNext(firstSpans, secondSpans)) != (byte) 0)) {
-
-	    log.trace("There's no match");
-
-	    if ((check == (byte) -1) && !secondSpans.next()) {
-		log.trace("No more secondSpans");
-		inSameDoc = false;
-		more = false;
-		break;
-	    }
-	    else if (check == (byte) 1 && !firstSpans.next()) {
-		log.trace("No more firstSpans");
-		inSameDoc = false;
-		more = false;
-		break;
-	    }
-	    else if (matchDoc != secondSpans.doc()) {
-		log.trace("secondSpans has another doc");
-		inSameDoc = false;
-		break;
-	    };
-	};
-	return inSameDoc;
-    };
-
-
-    /** Check whether two Spans in the same document are ordered.
-     * @return true iff spans1 starts before spans2
-     *              or the spans start at the same position,
-     *              and spans1 ends before spans2.
-     */
-    static final byte docNext (Spans spans1, Spans spans2) {
-	// check does
-	int start1 = spans1.start();
-	int start2 = spans2.start();
-
-	//	boolean val = (start1 == start2) ? (spans1.end() < spans2.end()) : (start1 < start2);
-	byte val;
-	if (start1 >= start2) {
-	    val = (byte) -1;
+    public NextSpans (SimpleSpanQuery simpleSpanQuery,
+	      AtomicReaderContext context,
+	      Bits acceptDocs,
+	      Map<Term,TermContext> termContexts,
+	      boolean collectPayloads) throws IOException {		 			
+		super(simpleSpanQuery, context, acceptDocs, termContexts,collectPayloads);
 	}
-	else {
-	    int end1 = spans1.end();
-	    if (end1 == start2) {
-		val = (byte) 0;
-	    }
-	    else if (end1 > start2) {
-		val = (byte) -1;
-	    }
-	    else {
-		val = (byte) 1;
-	    };
-	}
-	// -1: forward secondSpans
-	// 1: forward firstSpans
 
-	log.trace("{}-{} next to {}-{}", start1, spans1.end(), start2, spans2.end());
-	log.trace("docSpansOrdered: {}", val);
-
-	return val;
-    };
-};
+    /** Check whether the end position of the current firstspan equals 
+     *  the start position of the secondspan. 
+  	 * */
+	protected int findMatch() {		
+		if (firstSpans.end() == secondSpans.start()) {			
+			matchDocNumber = firstSpans.doc();
+			matchStartPosition = firstSpans.start();
+			matchEndPosition = secondSpans.end();	
+			return 0;
+		}		
+		else if (firstSpans.end() > secondSpans.start())
+			return 1;
+		
+		return -1;		
+	}	
+}

diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/SegmentSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/SegmentSpans.java
new file mode 100644
index 0000000..a401b0f
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/SegmentSpans.java

@@ -0,0 +1,65 @@
+package de.ids_mannheim.korap.query.spans;
+
+import java.io.IOException;
+import java.util.Map;
+
+import org.apache.lucene.index.AtomicReaderContext;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermContext;
+import org.apache.lucene.util.Bits;
+
+import de.ids_mannheim.korap.query.SimpleSpanQuery;
+
+/**	SegmentSpans is an enumeration of Span matches, which ensures that two spans: 
+ * 	a firstspan and a secondspan have exactly the same start and end positions.
+ * 
+ * 	@author margaretha 
+ * */
+public class SegmentSpans extends SimpleSpans {
+
+	/** Same as the five-argument constructor with payload collection on. */
+	public SegmentSpans (SimpleSpanQuery simpleSpanQuery,
+			AtomicReaderContext context,
+			Bits acceptDocs,
+			Map<Term,TermContext> termContexts) throws IOException {
+		this(simpleSpanQuery, context, acceptDocs, termContexts, true);
+	}
+
+	/** Passes all arguments on to the {@link SimpleSpans} constructor. */
+	public SegmentSpans (SimpleSpanQuery simpleSpanQuery,
+			AtomicReaderContext context,
+			Bits acceptDocs,
+			Map<Term,TermContext> termContexts,
+			boolean collectPayloads) throws IOException {
+		super(simpleSpanQuery, context, acceptDocs, termContexts,collectPayloads);
+	}
+
+	/** Checks whether the start and end positions of the current firstspan
+	 * 	and secondspan are identical and, if so, records the match.
+	 * 	Otherwise the span that is smaller by (start, end) is the one to
+	 * 	advance; comparing the ends on equal starts matters because spans
+	 * 	with the same start are enumerated by increasing end.
+	 * 	@return 0 on a match, -1 to advance the firstspan,
+	 * 			1 to advance the secondspan
+	 * */
+	@Override
+	protected int findMatch() {
+		int firstStart = firstSpans.start();
+		int firstEnd = firstSpans.end();
+		int secondStart = secondSpans.start();
+		int secondEnd = secondSpans.end();
+
+		if (firstStart == secondStart && firstEnd == secondEnd){
+			matchDocNumber = firstSpans.doc();
+			matchStartPosition = firstStart;
+			matchEndPosition = firstEnd;
+			return 0;
+		}
+		if (firstStart < secondStart ||
+				(firstStart == secondStart && firstEnd < secondEnd)) {
+			return -1;
+		}
+
+		return 1;
+	}
+}

diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/SimpleSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/SimpleSpans.java
new file mode 100644
index 0000000..4169729
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/SimpleSpans.java

@@ -0,0 +1,217 @@
+package de.ids_mannheim.korap.query.spans;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.lucene.index.AtomicReaderContext;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermContext;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.Spans;
+import org.apache.lucene.util.Bits;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import de.ids_mannheim.korap.query.SimpleSpanQuery;
+
+/** An abstract class for Span enumeration whose two child spans are matched by 
+ * 	their positions and do not have a partial overlap.
+ *  
+ * 	@author margaretha
+ * 
+ * */
+public abstract class SimpleSpans extends Spans{
+	private static final Logger log = LoggerFactory.getLogger(SimpleSpans.class);
+
+	private boolean isStartEnumeration;
+	private boolean hasMoreSpans;
+	protected int matchDocNumber, matchStartPosition, matchEndPosition;
+	// Payloads of the current match. Always allocated, so the methods below
+	// never hit null; it simply stays empty when collectPayloads is false.
+	private final List<byte[]> matchPayload = new LinkedList<byte[]>();
+	private boolean collectPayloads;
+
+	// Warning: enumeration of Spans
+	protected Spans firstSpans, secondSpans;
+	private SimpleSpanQuery query;
+
+	/** Opens the span enumerations of both clauses of the given query and
+	 * 	moves the second one to its first span.
+	 * 	@param collectPayloads whether payloads of the child spans are
+	 * 		copied into each match
+	 * 	@throws IOException passed on from the child enumerations
+	 * */
+	public SimpleSpans (SimpleSpanQuery simpleSpanQuery,
+			AtomicReaderContext context,
+			Bits acceptDocs,
+			Map<Term,TermContext> termContexts,
+			boolean collectPayloads) throws IOException {
+
+		// Initialize as invalid
+		matchDocNumber= -1;
+		matchStartPosition= -1;
+		matchEndPosition= -1;
+
+		this.collectPayloads = collectPayloads;
+
+		// Get the enumeration of the two spans to match
+		firstSpans = simpleSpanQuery.getFirstClause().
+			getSpans(context, acceptDocs, termContexts);
+		secondSpans = simpleSpanQuery.getSecondClause().
+			getSpans(context, acceptDocs, termContexts);
+
+		query = simpleSpanQuery;
+		hasMoreSpans = secondSpans.next();
+		isStartEnumeration=true;
+	}
+
+	/** Moves the firstspan one step forward and searches for the next match. */
+	@Override
+	public boolean next() throws IOException {
+		// Warning: this does not work for overlapping spans
+		// e.g. get multiple second spans in a firstspan
+		hasMoreSpans &= firstSpans.next();
+		isStartEnumeration=false;
+		matchPayload.clear();
+		return advance();
+	}
+
+	/** Advance is a lucene terminology to search for the next match.
+	 * 	Starting at the current child spans, the span named by findMatch()
+	 * 	is moved forward until a match is found or a child is exhausted.
+	 * */
+	private boolean advance() throws IOException {
+		// The complexity is linear for searching in a document.
+		// It's better if we can skip to >= position in a document.
+		while (hasMoreSpans && ensureSameDoc()){
+			int matchCase = findMatch();
+			if (matchCase == 0){
+				log.trace("Match doc#: {}",matchDocNumber);
+				log.trace("Match positions: {}-{}", matchStartPosition,
+						matchEndPosition);
+				doCollectPayloads();
+				return true;
+			}
+			else if (matchCase == 1) {
+				hasMoreSpans = secondSpans.next();
+			}
+			else{
+				hasMoreSpans = firstSpans.next();
+			}
+		}
+		return false;
+	}
+
+	/** Specify the condition for a match 
+	 * @return 0 iff match is found,
+	 * 			-1 to advance the firstspan,
+	 * 			1 to advance the secondspan
+	 * */
+	protected abstract int findMatch();
+
+	/** If the current firstspan and secondspan are not in the same document,
+	 * 	try to skip the span with the smaller document number, to the same
+	 * 	OR a greater document number than, the document number of the other
+	 * 	span. Do this until the firstspan and the secondspan are in the same
+	 * 	doc, OR until reaching the last document.
+	 *	@return true iff such a document exists.
+	 * */
+	private boolean ensureSameDoc() throws IOException {
+		while (firstSpans.doc() != secondSpans.doc()) {
+			if (firstSpans.doc() < secondSpans.doc()){
+				if (!firstSpans.skipTo(secondSpans.doc())){
+					hasMoreSpans = false;
+					return false;
+				}
+			}
+			else {
+				if (!secondSpans.skipTo(firstSpans.doc())){
+					hasMoreSpans = false;
+					return false;
+				}
+			}
+		}
+		return true;
+	}
+
+	/** Collecting available payloads from the current first and second spans */
+	private void doCollectPayloads() throws IOException {
+		if (collectPayloads){
+			log.trace("Collect payloads");
+			if (firstSpans.isPayloadAvailable()) {
+				Collection<byte[]> payload = firstSpans.getPayload();
+				log.trace("Found {} payloads in firstSpans", payload.size());
+				matchPayload.addAll(payload);
+			}
+			if (secondSpans.isPayloadAvailable()) {
+				Collection<byte[]> payload = secondSpans.getPayload();
+				log.trace("Found {} payloads in secondSpans", payload.size());
+				matchPayload.addAll(payload);
+			}
+		}
+	}
+
+	/** Moves to the next match whose document number is >= target.
+	 * 	If the firstspan is still before target it is skipped there;
+	 * 	otherwise it is moved one span forward as in next(), so that the
+	 * 	current match is never returned a second time.
+	 * */
+	@Override
+	public boolean skipTo(int target) throws IOException {
+		isStartEnumeration=false;
+		matchPayload.clear();
+		if (!hasMoreSpans) {
+			return false;
+		}
+		if (firstSpans.doc() < target){
+			hasMoreSpans = firstSpans.skipTo(target);
+		}
+		else {
+			hasMoreSpans = firstSpans.next();
+		}
+		return advance();
+	}
+
+	@Override
+	public int doc() {
+		return matchDocNumber;
+	}
+
+	@Override
+	public int start() {
+		return matchStartPosition;
+	}
+
+	@Override
+	public int end() {
+		return matchEndPosition;
+	}
+
+	/** Returns the payloads collected for the current match (possibly empty). */
+	@Override
+	public Collection<byte[]> getPayload() throws IOException {
+		return matchPayload;
+	}
+
+	@Override
+	public boolean isPayloadAvailable() throws IOException {
+		return !matchPayload.isEmpty();
+	}
+
+	/** Cost estimate: the sum of the costs of both child spans. */
+	@Override
+	public long cost() {
+		return firstSpans.cost() + secondSpans.cost();
+	}
+
+	@Override
+	public String toString() { // who does call this?
+		return getClass().getName() + "("+query.toString()+")@"+
+			(isStartEnumeration?"START":(hasMoreSpans?(doc()+":"+
+			start()+"-"+end()):"END"));
+	}
+
+}
commit	ed3bb3b392f5e1cd71645c9b2c566374c2a94888	[log] [tgz]
author	Eliza Margaretha <margaretha@ids-mannheim.de>	Tue Jan 14 10:53:56 2014 +0000
committer	Eliza Margaretha <margaretha@ids-mannheim.de>	Tue Jan 14 10:53:56 2014 +0000
tree	e5b7fce7618f97ddf73943b79a2a250e77b21373
parent	138e5b9fff6f90cdc0e9427e0a2ebeae13765890 [diff]