// blob: a9aa08b836b7b63c5c083e9e32ec543484318057 [file] [log] [blame]
package de.ids_mannheim.korap.query.spans;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.search.spans.TermSpans;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import de.ids_mannheim.korap.query.SpanElementQuery;
/**
 * Enumerates element spans on top of a {@link TermSpans}, decoding the end
 * position and (optionally) an element reference from each term payload.
 *
 * <p>
 * All term spans sharing the same document and start position are buffered
 * together and sorted before they are handed out one by one, so that
 * matches starting at the same position are returned in the order defined
 * by {@code CandidateSpan}'s natural ordering.
 * </p>
 *
 * <p>
 * Payload layout as read by this class: bytes 0-7 hold the character
 * offsets, bytes 8-11 the end position (int), and bytes 12-13 the element
 * reference (short, only read when {@link #isElementRef} is set).
 * </p>
 *
 * Use copyFrom instead of clone
 *
 * @author Nils Diewald, margaretha
 */
public class ElementSpans extends SimpleSpans {

    /** Number of leading payload bytes holding the character offsets. */
    private static final int CHAR_OFFSETS_LENGTH = 8;
    /** Byte offset of the end position (int) inside the payload. */
    private static final int END_POSITION_OFFSET = 8;
    /** Byte offset of the element reference (short) inside the payload. */
    private static final int ELEMENT_REF_OFFSET = 12;

    /**
     * Buffered candidates. They are only refilled when empty, so all
     * entries always share the same document and start position.
     */
    private List<CandidateElementSpans> candidateList;

    /** Document and start position of the group collected most recently. */
    private int currentDoc, currentPosition;

    /** Element reference of the current match. */
    private short elementRef;

    /** The underlying term spans (same object as the inherited firstSpans). */
    private TermSpans termSpans;

    public boolean isElementRef = false; // A dummy flag

    // Fixed: the logger used to be registered under AttributeSpans.
    protected Logger logger = LoggerFactory.getLogger(ElementSpans.class);

    /**
     * Creates the element spans and positions the underlying term spans on
     * their first entry.
     *
     * @param spanElementQuery the query these spans belong to
     * @param context the reader context of the index segment
     * @param acceptDocs the documents allowed to match
     * @param termContexts the term contexts shared within the query
     * @throws IOException if the underlying index cannot be read
     */
    public ElementSpans(SpanElementQuery spanElementQuery,
            AtomicReaderContext context, Bits acceptDocs,
            Map<Term, TermContext> termContexts) throws IOException {
        super(spanElementQuery, context, acceptDocs, termContexts);
        candidateList = new ArrayList<>();
        termSpans = (TermSpans) firstSpans;
        hasMoreSpans = termSpans.next();
        if (hasMoreSpans) {
            currentDoc = termSpans.doc();
            currentPosition = termSpans.start();
        }
    }

    /**
     * Moves to the next element span.
     *
     * @return {@code true} iff a further match exists
     * @throws IOException if the underlying index cannot be read
     */
    @Override
    public boolean next() throws IOException {
        isStartEnumeration = false;
        return advance();
    }

    /**
     * Gets the next match by first checking the candidate match list and
     * refilling the list when it is empty.
     *
     * @return {@code true} iff a match was set
     */
    private boolean advance() throws IOException {
        if (candidateList.isEmpty()) {
            if (!hasMoreSpans) {
                return false;
            }
            // hasMoreSpans guarantees that at least one candidate is added.
            setCandidateList();
        }

        CandidateElementSpans cs = candidateList.remove(0);
        this.matchDocNumber = cs.getDoc();
        this.matchStartPosition = cs.getStart();
        this.matchEndPosition = cs.getEnd();
        this.matchPayload = cs.getPayloads();
        this.setElementRef(cs.getElementRef());
        return true;
    }

    /**
     * Collects all the elements at the current document and start position
     * of the term spans and sorts them (smallest end position first,
     * presumably - the order is defined by CandidateSpan; verify there).
     *
     * <p>
     * Must only be called while {@code hasMoreSpans} is {@code true}, i.e.
     * while the term spans are positioned on a valid entry.
     * </p>
     */
    private void setCandidateList() throws IOException {
        // Lowered from info: this runs once per start position on the
        // search hot path.
        logger.debug("Setting candidate list");

        // Always synchronise with the term spans first, so the group is
        // correct no matter how the term spans were moved (next or skipTo).
        currentDoc = termSpans.doc();
        currentPosition = termSpans.start();

        do {
            CandidateElementSpans cs = new CandidateElementSpans(termSpans,
                    elementRef);
            readPayload(cs);
            candidateList.add(cs);
            hasMoreSpans = termSpans.next();
        } while (hasMoreSpans && termSpans.doc() == currentDoc
                && termSpans.start() == currentPosition);

        Collections.sort(candidateList);
    }

    /**
     * This method reads the payload of the termSpan and assigns the end
     * position and element ref to the candidate match. The character offset
     * payload is set as the candidate match payload.
     * <br/><br/>
     * <em>Note</em>: the candidate payload should actually collect all
     * other payloads besides end position and element ref, but KorapIndex
     * identifies an element's payload by its length (8), which is only the
     * character offsets. So these offsets are directly set as the candidate
     * match payload.
     *
     * <p>
     * Without a payload the candidate ends at its start position, has the
     * element reference -1 and a {@code null} payload collection.
     * NOTE(review): the payload is assumed to be long enough for the fields
     * read from it (12 bytes, or 14 with element reference) - confirm
     * against the indexer.
     * </p>
     *
     * @author margaretha
     */
    private void readPayload(CandidateElementSpans cs) throws IOException {
        BytesRef payload = termSpans.getPostings().getPayload();

        if (payload == null) {
            cs.setEnd(cs.getStart());
            cs.setElementRef((short) -1);
            cs.setPayloads(null);
            return;
        }

        cs.setEnd(readEndPostion(payload));
        cs.setElementRef(isElementRef ? readElementRef(payload) : (short) -1);

        // A mutable list, because skipTo() clears the match payload; an
        // immutable singleton list would throw on clear() if the candidate
        // hands the very same collection back.
        List<byte[]> offsets = new ArrayList<>(1);
        offsets.add(readOffset(payload));
        cs.setPayloads(offsets);
    }

    /**
     * Gets a copy of the character offset bytes from the payload.
     */
    private byte[] readOffset(BytesRef payload) {
        byte[] b = new byte[CHAR_OFFSETS_LENGTH];
        System.arraycopy(payload.bytes, payload.offset, b, 0,
                CHAR_OFFSETS_LENGTH);
        return b;
    }

    /**
     * Gets the end position bytes from the payload and decodes them as a
     * big-endian int.
     */
    private int readEndPostion(BytesRef payload) {
        return ByteBuffer.wrap(payload.bytes,
                payload.offset + END_POSITION_OFFSET, 4).getInt();
    }

    /**
     * Gets the elementRef bytes from the payload and decodes them as a
     * big-endian short.
     */
    private short readElementRef(BytesRef payload) {
        return ByteBuffer.wrap(payload.bytes,
                payload.offset + ELEMENT_REF_OFFSET, 2).getShort();
    }

    /**
     * Skips to the first match beyond the current one whose document number
     * is greater than or equal to {@code target}.
     *
     * <p>
     * Buffered candidates of earlier documents are discarded, and the
     * spans are marked as exhausted when the underlying term spans cannot
     * reach the target.
     * </p>
     *
     * @param target the minimum document number of the next match
     * @return {@code true} iff such a match exists
     * @throws IOException if the underlying index cannot be read
     */
    @Override
    public boolean skipTo(int target) throws IOException {
        isStartEnumeration = false;

        // All buffered candidates share one document, so checking the
        // first one is sufficient.
        if (!candidateList.isEmpty()
                && candidateList.get(0).getDoc() < target) {
            candidateList.clear();
        }

        if (hasMoreSpans && termSpans.doc() < target) {
            // Record exhaustion, otherwise a later next() would read from
            // the exhausted term spans.
            hasMoreSpans = termSpans.skipTo(target);
            if (hasMoreSpans) {
                currentDoc = termSpans.doc();
                currentPosition = termSpans.start();
            }
        }

        // The payload of the previous match may be null (element without
        // payload), hence the guard.
        if (matchPayload != null) {
            matchPayload.clear();
        }

        return advance();
    }

    /**
     * @return the cost reported by the underlying term spans
     */
    @Override
    public long cost() {
        return termSpans.cost();
    }

    /**
     * @return the element reference of the current match
     */
    public short getElementRef() {
        return elementRef;
    }

    /**
     * @param elementRef the element reference of the current match
     */
    public void setElementRef(short elementRef) {
        this.elementRef = elementRef;
    }

    /**
     * Match candidate for element spans: a candidate span that additionally
     * carries an element reference.
     */
    class CandidateElementSpans extends CandidateSpan {

        private short elementRef;

        /**
         * @param span the span the candidate is created from
         * @param elementRef the initial element reference
         * @throws IOException if the span cannot be read
         */
        public CandidateElementSpans(Spans span, short elementRef)
                throws IOException {
            super(span);
            this.elementRef = elementRef;
        }

        public void setElementRef(short elementRef) {
            this.elementRef = elementRef;
        }

        public short getElementRef() {
            return elementRef;
        }
    }
}