src/main/java/de/ids_mannheim/korap/query/spans/ElementSpans.java - KorAP/Krill - Gitiles

 package de.ids_mannheim.korap.query.spans;

 import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
 import java.util.Map;

 import org.apache.lucene.index.AtomicReaderContext;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermContext;
 import org.apache.lucene.search.spans.Spans;
 import org.apache.lucene.search.spans.TermSpans;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import de.ids_mannheim.korap.query.SpanElementQuery;

 /**
  * Enumeration of element spans read from a {@link TermSpans}.
  *
  * <p>All term spans sharing the same document and start position are
  * buffered as candidates, sorted with {@link Collections#sort(List)}
  * (i.e. by the natural ordering of {@code CandidateSpan}; according to
  * the original authors this is by end position, smallest first) and then
  * returned one at a time.
  *
  * <p>Payload layout as read by this class: 8 bytes of character offsets,
  * followed by a 4-byte end position and, only when {@link #isElementRef}
  * is set, a 2-byte element reference.
  *
  * @author Nils Diewald, margaretha
  *
  * Use copyFrom instead of clone
  */
 public class ElementSpans extends SimpleSpans {

 	/** Number of leading payload bytes holding the character offsets. */
 	private static final int OFFSET_LENGTH = 8;
 	/** Payload index at which the 4-byte end position starts. */
 	private static final int END_POSITION_INDEX = 8;
 	/** Payload index at which the 2-byte element reference starts. */
 	private static final int ELEMENT_REF_INDEX = 12;

 	// Candidates of a single (document, start position) group, in sorted order.
 	private List<CandidateElementSpans> candidateList;
 	// Document and start position of the group to be collected next.
 	private int currentDoc, currentPosition;
 	// Element reference of the current match.
 	private short elementRef;
 	// The underlying term spans (same object as firstSpans).
 	private TermSpans termSpans;

 	public boolean isElementRef = false; // A dummy flag

 	// Fixed: the logger was registered under AttributeSpans.
 	protected Logger logger = LoggerFactory.getLogger(ElementSpans.class);

 	/**
 	 * Creates the enumeration and positions the underlying term spans on
 	 * their first span, if there is one.
 	 *
 	 * @param spanElementQuery the query this enumeration belongs to
 	 * @param context the reader context passed on to the superclass
 	 * @param acceptDocs the accepted documents passed on to the superclass
 	 * @param termContexts the term contexts passed on to the superclass
 	 * @throws IOException if reading the first term span fails
 	 */
 	public ElementSpans(SpanElementQuery spanElementQuery,
 			AtomicReaderContext context, Bits acceptDocs,
 			Map<Term, TermContext> termContexts) throws IOException {
 		super(spanElementQuery, context, acceptDocs, termContexts);
 		candidateList = new ArrayList<>();
 		termSpans = (TermSpans) firstSpans;
 		hasMoreSpans = termSpans.next();
 		if (hasMoreSpans) {
 			currentDoc = termSpans.doc();
 			currentPosition = termSpans.start();
 		}
 	}

 	/**
 	 * Moves to the next element span.
 	 *
 	 * @return {@code true} if a match is available
 	 * @throws IOException if reading the term spans fails
 	 */
 	@Override
 	public boolean next() throws IOException {
 		isStartEnumeration = false;
 		return advance();
 	}

 	/**
 	 * Gets the next match by first checking the candidate list and
 	 * refilling the list when it is empty.
 	 *
 	 * @return {@code true} if a match was set, {@code false} when both the
 	 *         candidate list and the term spans are exhausted
 	 */
 	private boolean advance() throws IOException {
 		while (hasMoreSpans || !candidateList.isEmpty()) {
 			if (!candidateList.isEmpty()) {
 				CandidateElementSpans cs = candidateList.remove(0);
 				this.matchDocNumber = cs.getDoc();
 				this.matchStartPosition = cs.getStart();
 				this.matchEndPosition = cs.getEnd();
 				this.matchPayload = cs.getPayloads();
 				this.setElementRef(cs.getElementRef());
 				return true;
 			}
 			logger.info("Setting candidate list");
 			setCandidateList();
 			// Only track the next group while the term spans are still
 			// positioned on a span (mirrors the guard in the constructor).
 			if (hasMoreSpans) {
 				currentDoc = termSpans.doc();
 				currentPosition = termSpans.start();
 			}
 		}
 		return false;
 	}

 	/**
 	 * Collects all term spans located at {@code currentDoc} and
 	 * {@code currentPosition} into the candidate list and sorts the list.
 	 */
 	private void setCandidateList() throws IOException {
 		while (hasMoreSpans && termSpans.doc() == currentDoc
 				&& termSpans.start() == currentPosition) {
 			CandidateElementSpans cs = new CandidateElementSpans(termSpans,
 					elementRef);
 			readPayload(cs);
 			candidateList.add(cs);
 			hasMoreSpans = termSpans.next();
 		}
 		Collections.sort(candidateList);
 	}

 	/**
 	 * Reads the payload of the current term span and assigns the end
 	 * position and the element reference to the candidate. The character
 	 * offsets are set as the candidate's payload. Without a payload the
 	 * candidate ends at its start position and carries no payload.
 	 * <br/><br/>
 	 * <em>Note</em> (from the original author): the payload should
 	 * actually carry all data besides end position and element reference,
 	 * but KorapIndex identifies an element's payload by its length (8),
 	 * which covers only the character offsets. Hence only these offsets
 	 * are passed on.
 	 *
 	 * @author margaretha
 	 */
 	private void readPayload(CandidateElementSpans cs) throws IOException {
 		BytesRef payload = termSpans.getPostings().getPayload();

 		if (payload == null) {
 			cs.setEnd(cs.getStart());
 			cs.setElementRef((short) -1);
 			cs.setPayloads(null);
 			return;
 		}

 		cs.setEnd(readEndPostion(payload));
 		// -1 marks "no element reference".
 		cs.setElementRef(isElementRef ? readElementRef(payload) : (short) -1);
 		cs.setPayloads(Collections.singletonList(readOffset(payload)));
 	}

 	/** Copies the character offset bytes from the payload. */
 	private byte[] readOffset(BytesRef payload) {
 		byte[] b = new byte[OFFSET_LENGTH];
 		System.arraycopy(payload.bytes, payload.offset, b, 0, OFFSET_LENGTH);
 		return b;
 	}

 	/** Reads the end position (big-endian int) from the payload. */
 	private int readEndPostion(BytesRef payload) {
 		return ByteBuffer.wrap(payload.bytes,
 				payload.offset + END_POSITION_INDEX, 4).getInt();
 	}

 	/** Reads the element reference (big-endian short) from the payload. */
 	private short readElementRef(BytesRef payload) {
 		return ByteBuffer.wrap(payload.bytes,
 				payload.offset + ELEMENT_REF_INDEX, 2).getShort();
 	}

 	/**
 	 * Skips to the next match whose document number is greater than or
 	 * equal to {@code target}.
 	 *
 	 * @param target the minimum document number of the next match
 	 * @return {@code true} if such a match exists
 	 * @throws IOException if reading the term spans fails
 	 */
 	@Override
 	public boolean skipTo(int target) throws IOException {
 		// All buffered candidates belong to one document; they must not be
 		// returned when that document lies before the target.
 		if (!candidateList.isEmpty()
 				&& candidateList.get(0).getDoc() < target) {
 			candidateList.clear();
 		}
 		if (hasMoreSpans && termSpans.doc() < target) {
 			// Remember exhaustion so that later calls do not read from
 			// the exhausted term spans.
 			hasMoreSpans = termSpans.skipTo(target);
 			if (hasMoreSpans) {
 				currentDoc = termSpans.doc();
 				currentPosition = termSpans.start();
 			}
 		}
 		// The previous payload may be null or an immutable singleton list
 		// (see readPayload), so it is replaced instead of cleared.
 		matchPayload = new ArrayList<>();
 		isStartEnumeration = false;
 		return advance();
 	}

 	/** Returns the cost reported by the underlying term spans. */
 	@Override
 	public long cost() {
 		return termSpans.cost();
 	}

 	/** Returns the element reference of the current match. */
 	public short getElementRef() {
 		return elementRef;
 	}

 	/** Sets the element reference of the current match. */
 	public void setElementRef(short elementRef) {
 		this.elementRef = elementRef;
 	}

 	/**
 	 * Match candidate for element spans: a candidate span that
 	 * additionally carries an element reference.
 	 */
 	class CandidateElementSpans extends CandidateSpan {

 		private short elementRef;

 		/**
 		 * @param span the span the candidate is created from
 		 * @param elementRef the initial element reference
 		 * @throws IOException if the superclass constructor fails
 		 */
 		public CandidateElementSpans(Spans span, short elementRef)
 				throws IOException {
 			super(span);
 			setElementRef(elementRef);
 		}

 		public void setElementRef(short elementRef) {
 			this.elementRef = elementRef;
 		}

 		public short getElementRef() {
 			return elementRef;
 		}
 	}
 }
	package de.ids_mannheim.korap.query.spans;

	import java.io.IOException;
	import java.nio.ByteBuffer;
	import java.util.ArrayList;
	import java.util.Collections;
	import java.util.List;
	import java.util.Map;

	import org.apache.lucene.index.AtomicReaderContext;
	import org.apache.lucene.index.Term;
	import org.apache.lucene.index.TermContext;
	import org.apache.lucene.search.spans.Spans;
	import org.apache.lucene.search.spans.TermSpans;
	import org.apache.lucene.util.Bits;
	import org.apache.lucene.util.BytesRef;
	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;

	import de.ids_mannheim.korap.query.SpanElementQuery;

	/**
	* @author Nils Diewald, margaretha
	*
	* Use copyFrom instead of clone
	*/
	public class ElementSpans extends SimpleSpans {

	private List<CandidateElementSpans> candidateList;
	private int currentDoc, currentPosition;
	private short elementRef;
	private TermSpans termSpans;

	public boolean isElementRef = false; // A dummy flag

	protected Logger logger = LoggerFactory.getLogger(AttributeSpans.class);

	public ElementSpans(SpanElementQuery spanElementQuery,
	AtomicReaderContext context, Bits acceptDocs,
	Map<Term, TermContext> termContexts) throws IOException {
	super(spanElementQuery, context, acceptDocs, termContexts);
	candidateList = new ArrayList<>();
	termSpans = (TermSpans) firstSpans;
	hasMoreSpans = termSpans.next();
	if (hasMoreSpans) {
	currentDoc = termSpans.doc();
	currentPosition = termSpans.start();
	}
	}

	@Override
	public boolean next() throws IOException {
	isStartEnumeration=false;
	return advance();
	}

	/** Get the next match by first checking the candidate match list
	* and setting the list when it is empty.
	* */
	private boolean advance() throws IOException {
	while(hasMoreSpans \|\| !candidateList.isEmpty()){
	if (!candidateList.isEmpty()){
	CandidateElementSpans cs = candidateList.get(0);
	this.matchDocNumber = cs.getDoc();
	this.matchStartPosition = cs.getStart();
	this.matchEndPosition = cs.getEnd();
	this.matchPayload = cs.getPayloads();
	this.setElementRef(cs.getElementRef());
	candidateList.remove(0);
	return true;
	}
	else{
	logger.info("Setting candidate list");
	setCandidateList();
	currentDoc = termSpans.doc();
	currentPosition = termSpans.start();
	}
	}
	return false;
	}

	/** Collect all the elements in the same start position and sort them by
	* end position (smallest first).
	* */
	private void setCandidateList() throws IOException {
	while (hasMoreSpans && termSpans.doc() == currentDoc &&
	termSpans.start() == currentPosition){
	CandidateElementSpans cs = new CandidateElementSpans(termSpans,
	elementRef);
	readPayload(cs);
	candidateList.add(cs);
	hasMoreSpans = termSpans.next();
	}
	Collections.sort(candidateList);
	}


	/** This method reads the payload of the termSpan and assigns the end
	* position and element ref to the candidate match. The character offset
	* payload is set as the candidate match payload.
	* <br/><br/>
	* <em>Note</em>: payloadbuffer should actually collects all other payload
	* beside end position and element ref, but KorapIndex identify element's
	* payload by its length (8), which is only the character offsets. So
	* these offsets are directly set as the candidate match payload.
	*
	* @author margaretha
	* */
	private void readPayload(CandidateElementSpans cs) throws IOException {
	BytesRef payload = termSpans.getPostings().getPayload();
	//ByteBuffer payloadBuffer = ByteBuffer.allocate(128);

	if (payload != null) {
	// Copy some payloads like start character and end character
	//payloadBuffer.put(payload.bytes, payload.offset, 8);

	cs.setEnd(readEndPostion(payload));

	if (isElementRef ){
	// Copy rest of payloads after the end position and elementref
	//payloadBuffer.put(payload.bytes, payload.offset + 14, payload.length - 14);
	cs.setElementRef(readElementRef(payload));
	}
	else{
	// Copy rest of payloads after the end position
	//payloadBuffer.put(payload.bytes, payload.offset + 12, payload.length - 12);
	cs.setElementRef((short) -1);
	}

	//byte[] offsetCharacters = new byte[8];
	//System.arraycopy(payloadBuffer.array(), 0, offsetCharacters, 0, 8);

	cs.setPayloads(Collections.singletonList(readOffset(payload)));
	}
	else {
	cs.setEnd(cs.getStart());
	cs.setElementRef((short) -1);
	cs.setPayloads(null);
	}
	}


	/** Get the offset bytes from the payload.
	* */
	private byte[] readOffset(BytesRef payload){
	byte[] b = new byte[8];
	System.arraycopy(payload.bytes, payload.offset, b, 0, 8);
	return b;
	}

	/** Get the end position bytes from the payload and cast it to int.
	* */
	private int readEndPostion(BytesRef payload) {
	byte[] b = new byte[4];
	System.arraycopy(payload.bytes, payload.offset + 8, b, 0, 4);
	return ByteBuffer.wrap(b).getInt();
	}

	/** Get the elementRef bytes from the payload and cast it into short.
	* */
	private short readElementRef(BytesRef payload) {
	byte[] b = new byte[2];
	System.arraycopy(payload.bytes, payload.offset + 12, b, 0, 2);
	return ByteBuffer.wrap(b).getShort();
	}

	@Override
	public boolean skipTo(int target) throws IOException {
	if (hasMoreSpans && (firstSpans.doc() < target)){
	if (!firstSpans.skipTo(target)){
	candidateList.clear();
	return false;
	}
	}
	setCandidateList();
	matchPayload.clear();
	isStartEnumeration=false;
	return advance();
	}

	@Override
	public long cost() {
	return termSpans.cost();
	}

	public short getElementRef() {
	return elementRef;
	}

	public void setElementRef(short elementRef) {
	this.elementRef = elementRef;
	}

	/** Match candidate for element spans.
	* */
	class CandidateElementSpans extends CandidateSpan {

	private short elementRef;

	public CandidateElementSpans(Spans span, short elementRef)
	throws IOException {
	super(span);
	setElementRef(elementRef);
	}

	public void setElementRef(short elementRef) {
	this.elementRef = elementRef;
	}
	public short getElementRef() {
	return elementRef;
	}
	}
	};