src/main/java/de/ids_mannheim/korap/query/spans/ElementSpans.java - KorAP/Krill - Gitiles

 package de.ids_mannheim.korap.query.spans;

 import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.List;
 import java.util.Map;

 import org.apache.lucene.index.AtomicReaderContext;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermContext;
 import org.apache.lucene.search.spans.Spans;
 import org.apache.lucene.search.spans.TermSpans;
 import org.apache.lucene.util.Bits;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import de.ids_mannheim.korap.query.SpanElementQuery;

 /**
  * Enumerates element spans backed by a single {@link TermSpans}. All spans
  * that share the same document and start position are buffered as
  * candidates, sorted, and then handed out one at a time.
  *
  * Use copyFrom instead of clone
  *
  * @author Nils Diewald, margaretha
  */
 public class ElementSpans extends SpansWithId {

 	private static final Logger logger = LoggerFactory
 			.getLogger(ElementSpans.class);

 	/** Candidates sharing one document and start position, already sorted. */
 	private final List<CandidateElementSpans> candidateList;

 	/** Document and start position of the group to be collected next. */
 	private int currentDoc, currentPosition;

 	/** The underlying term spans (same object as {@code firstSpans}). */
 	private TermSpans termSpans;

 	/**
 	 * Creates the enumeration and positions the underlying term spans on
 	 * their first span, if there is one.
 	 *
 	 * @throws IOException if reading the underlying spans fails
 	 */
 	public ElementSpans(SpanElementQuery spanElementQuery,
 			AtomicReaderContext context, Bits acceptDocs,
 			Map<Term, TermContext> termContexts) throws IOException {
 		super(spanElementQuery, context, acceptDocs, termContexts);
 		candidateList = new ArrayList<>();
 		termSpans = (TermSpans) firstSpans;
 		hasMoreSpans = termSpans.next();
 		if (hasMoreSpans) {
 			currentDoc = termSpans.doc();
 			currentPosition = termSpans.start();
 		}
 	}

 	/**
 	 * Moves to the next element span.
 	 *
 	 * @return {@code true} if a match was set, {@code false} when exhausted
 	 */
 	@Override
 	public boolean next() throws IOException {
 		isStartEnumeration = false;
 		return advance();
 	}

 	/**
 	 * Sets the next match from the head of the candidate list, refilling the
 	 * list from the term spans whenever it is empty.
 	 *
 	 * @return {@code true} if a match was set, {@code false} when both the
 	 *         candidate list and the term spans are exhausted
 	 */
 	private boolean advance() throws IOException {
 		while (hasMoreSpans || !candidateList.isEmpty()) {
 			if (!candidateList.isEmpty()) {
 				CandidateElementSpans cs = candidateList.remove(0);
 				this.matchDocNumber = cs.getDoc();
 				this.matchStartPosition = cs.getStart();
 				this.matchEndPosition = cs.getEnd();
 				this.matchPayload = cs.getPayloads();
 				this.setSpanId(cs.getSpanId());
 				return true;
 			}
 			setCandidateList();
 			// Only read the new position while the term spans are still
 			// valid; once exhausted it is never consulted again.
 			if (hasMoreSpans) {
 				currentDoc = termSpans.doc();
 				currentPosition = termSpans.start();
 			}
 		}
 		return false;
 	}

 	/**
 	 * Collects every span located at ({@code currentDoc},
 	 * {@code currentPosition}) into the candidate list and sorts the list
 	 * by the natural ordering of the candidates (per the original note:
 	 * end position, smallest first).
 	 */
 	private void setCandidateList() throws IOException {
 		while (hasMoreSpans && termSpans.doc() == currentDoc
 				&& termSpans.start() == currentPosition) {
 			CandidateElementSpans cs = new CandidateElementSpans(termSpans,
 					spanId);
 			readPayload(cs);
 			candidateList.add(cs);
 			hasMoreSpans = termSpans.next();
 		}
 		Collections.sort(candidateList);
 	}

 	/**
 	 * Reads the payload of the current term span into the candidate: the
 	 * int at byte offset 8 becomes the end position, the short at byte
 	 * offset 12 becomes the span id (only if {@code hasSpanId}; otherwise
 	 * -1), and the first 8 bytes (the character offsets) become the
 	 * candidate's payload.
 	 * <br/><br/>
 	 * <em>Note</em>: the payload buffer should actually collect all other
 	 * payloads besides end position and element ref, but KorapIndex
 	 * identifies an element's payload by its length (8), which is only the
 	 * character offsets. So these offsets are set directly as the candidate
 	 * match payload.
 	 * <br/><br/>
 	 * If there is no payload (empty list or a {@code null} entry), the
 	 * candidate's end is set to its start, its span id to -1 and its
 	 * payload to {@code null}.
 	 *
 	 * @author margaretha
 	 */
 	private void readPayload(CandidateElementSpans cs) throws IOException {
 		List<byte[]> payload = (List<byte[]>) termSpans.getPayload();
 		// Check for absence BEFORE touching element 0.
 		byte[] raw = (payload == null || payload.isEmpty())
 				? null : payload.get(0);

 		if (raw == null) {
 			cs.setEnd(cs.getStart());
 			cs.setSpanId((short) -1);
 			cs.setPayloads(null);
 			return;
 		}

 		// Wrapping avoids copying the bytes into a second buffer.
 		ByteBuffer bb = ByteBuffer.wrap(raw);
 		// set element end position from payload
 		cs.setEnd(bb.getInt(8));
 		// copy element id, or -1 if the index carries none
 		cs.setSpanId(hasSpanId ? bb.getShort(12) : (short) -1);
 		// Copy the start and end character offsets
 		cs.setPayloads(Collections.singletonList(
 				Arrays.copyOfRange(raw, 0, 8)));
 	}

 	/**
 	 * Skips to the first element span whose document is not before
 	 * {@code target}.
 	 *
 	 * @param target the minimum document number to move to
 	 * @return {@code true} if a match was set, {@code false} otherwise
 	 */
 	@Override
 	public boolean skipTo(int target) throws IOException {
 		// All buffered candidates share one document; if that document lies
 		// before the target they must not be returned.
 		if (!candidateList.isEmpty()
 				&& candidateList.get(0).getDoc() < target) {
 			candidateList.clear();
 		}
 		if (hasMoreSpans && (firstSpans.doc() < target)) {
 			if (!firstSpans.skipTo(target)) {
 				// Remember exhaustion so a later next() does not read from
 				// the exhausted term spans.
 				hasMoreSpans = false;
 				candidateList.clear();
 				return false;
 			}
 			// Re-synchronise with the new position of the term spans.
 			currentDoc = termSpans.doc();
 			currentPosition = termSpans.start();
 		}
 		// NOTE(review): matchPayload may be the list obtained from a
 		// candidate in advance(); whether clear() is safe depends on how
 		// CandidateSpans stores payloads - confirm.
 		matchPayload.clear();
 		isStartEnumeration = false;
 		// advance() refills the candidate list itself when it is empty;
 		// refilling here could mix two start positions in one list.
 		return advance();
 	}

 	/** @return the cost reported by the underlying term spans */
 	@Override
 	public long cost() {
 		return termSpans.cost();
 	}

 	/**
 	 * Match candidate for element spans: a candidate span plus the id of
 	 * the element it stands for.
 	 */
 	class CandidateElementSpans extends CandidateSpans {

 		private short elementRef;

 		/**
 		 * @param span the span the candidate is created from
 		 * @param elementRef the initial span id of the candidate
 		 */
 		public CandidateElementSpans(Spans span, short elementRef)
 				throws IOException {
 			super(span);
 			setSpanId(elementRef);
 		}

 		public void setSpanId(short elementRef) {
 			this.elementRef = elementRef;
 		}

 		public short getSpanId() {
 			return elementRef;
 		}
 	}
 }
	package de.ids_mannheim.korap.query.spans;

	import java.io.IOException;
	import java.nio.ByteBuffer;
	import java.util.ArrayList;
	import java.util.Arrays;
	import java.util.Collections;
	import java.util.List;
	import java.util.Map;

	import org.apache.lucene.index.AtomicReaderContext;
	import org.apache.lucene.index.Term;
	import org.apache.lucene.index.TermContext;
	import org.apache.lucene.search.spans.Spans;
	import org.apache.lucene.search.spans.TermSpans;
	import org.apache.lucene.util.Bits;
	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;

	import de.ids_mannheim.korap.query.SpanElementQuery;

	/**
	* @author Nils Diewald, margaretha
	*
	* Use copyFrom instead of clone
	*/
	public class ElementSpans extends SpansWithId {

	private List<CandidateElementSpans> candidateList;
	private int currentDoc, currentPosition;
	private TermSpans termSpans;

	private Logger logger = LoggerFactory.getLogger(ElementSpans.class);

	public ElementSpans(SpanElementQuery spanElementQuery,
	AtomicReaderContext context, Bits acceptDocs,
	Map<Term, TermContext> termContexts) throws IOException {
	super(spanElementQuery, context, acceptDocs, termContexts);
	candidateList = new ArrayList<>();
	termSpans = (TermSpans) firstSpans;
	hasMoreSpans = termSpans.next();
	if (hasMoreSpans) {
	currentDoc = termSpans.doc();
	currentPosition = termSpans.start();
	}
	}

	@Override
	public boolean next() throws IOException {
	isStartEnumeration=false;
	return advance();
	}

	/** Get the next match by first checking the candidate match list
	* and setting the list when it is empty.
	* */
	private boolean advance() throws IOException {
	while(hasMoreSpans \|\| !candidateList.isEmpty()){
	if (!candidateList.isEmpty()){
	CandidateElementSpans cs = candidateList.get(0);
	this.matchDocNumber = cs.getDoc();
	this.matchStartPosition = cs.getStart();
	this.matchEndPosition = cs.getEnd();
	this.matchPayload = cs.getPayloads();
	//this.setElementRef(cs.getSpanId());
	this.setSpanId(cs.getSpanId());
	candidateList.remove(0);
	return true;
	}
	else{
	//logger.info("Setting candidate list");
	setCandidateList();
	currentDoc = termSpans.doc();
	currentPosition = termSpans.start();
	}
	}
	return false;
	}

	/** Collect all the elements in the same start position and sort them by
	* end position (smallest first).
	* */
	private void setCandidateList() throws IOException {
	while (hasMoreSpans && termSpans.doc() == currentDoc &&
	termSpans.start() == currentPosition){
	CandidateElementSpans cs = new CandidateElementSpans(termSpans,
	spanId);
	//elementRef);
	readPayload(cs);
	candidateList.add(cs);
	hasMoreSpans = termSpans.next();
	}
	Collections.sort(candidateList);
	}


	/** This method reads the payload of the termSpan and assigns the end
	* position and element ref to the candidate match. The character offset
	* payload is set as the candidate match payload.
	* <br/><br/>
	* <em>Note</em>: payloadbuffer should actually collects all other payload
	* beside end position and element ref, but KorapIndex identify element's
	* payload by its length (8), which is only the character offsets. So
	* these offsets are directly set as the candidate match payload.
	*
	* @author margaretha
	* */
	private void readPayload(CandidateElementSpans cs) throws IOException {
	List<byte[]> payload = (List<byte[]>) termSpans.getPayload();
	int length = payload.get(0).length;
	ByteBuffer bb = ByteBuffer.allocate(length);
	bb.put(payload.get(0));

	if (!payload.isEmpty()) {
	// set element end position from payload
	cs.setEnd(bb.getInt(8));

	if (hasSpanId){ // copy element id
	cs.setSpanId(bb.getShort(12));
	}
	else{ // set element id -1
	cs.setSpanId((short) -1);
	}
	// Copy the start and end character offsets
	byte[] b = new byte[8];
	b = Arrays.copyOfRange(bb.array(), 0, 8);
	cs.setPayloads(Collections.singletonList(b));
	}
	else {
	cs.setEnd(cs.getStart());
	cs.setSpanId((short) -1);
	cs.setPayloads(null);
	}
	}

	@Override
	public boolean skipTo(int target) throws IOException {
	if (hasMoreSpans && (firstSpans.doc() < target)){
	if (!firstSpans.skipTo(target)){
	candidateList.clear();
	return false;
	}
	}
	setCandidateList();
	matchPayload.clear();
	isStartEnumeration=false;
	return advance();
	}

	@Override
	public long cost() {
	return termSpans.cost();
	}

	/** Match candidate for element spans.
	* */
	class CandidateElementSpans extends CandidateSpans {

	private short elementRef;

	public CandidateElementSpans(Spans span, short elementRef)
	throws IOException {
	super(span);
	setSpanId(elementRef);
	}

	public void setSpanId(short elementRef) {
	this.elementRef = elementRef;
	}
	public short getSpanId() {
	return elementRef;
	}
	}
	};