blob: 6019125fef5c76346aeee138ebca4b1ab8f1c7a8 [file] [log] [blame]
package de.ids_mannheim.korap.query.spans;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.util.Bits;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import de.ids_mannheim.korap.query.SpanRepetitionQuery;
/** Enumeration of spans occurring multiple times in a sequence.
* The number of min and max repetition can be set.
*
* @author margaretha
* */
public class RepetitionSpans extends SimpleSpans{
private int min,max;
private long matchCost;
private List<CandidateSpan> matchList;
private Logger log = LoggerFactory.getLogger(RepetitionSpans.class);
// This advices the java compiler to ignore all loggings
public static final boolean DEBUG = false;
public RepetitionSpans(SpanRepetitionQuery query,
AtomicReaderContext context, Bits acceptDocs,
Map<Term, TermContext> termContexts)
throws IOException {
super(query, context, acceptDocs, termContexts);
this.min = query.getMin();
this.max = query.getMax();
matchList = new ArrayList<CandidateSpan>();
hasMoreSpans = firstSpans.next();
}
@Override
public boolean next() throws IOException {
isStartEnumeration = false;
matchPayload.clear();
return advance();
}
/** Get the next span from the candidate match list, or set it first when
* it is empty.
* */
private boolean advance() throws IOException {
while (hasMoreSpans || !matchList.isEmpty()){
if (!matchList.isEmpty()){
setMatchProperties(matchList.get(0));
matchList.remove(0);
return true;
}
matchCost = 0;
List<CandidateSpan> adjacentSpans = collectAdjacentSpans();
setMatchList(adjacentSpans);
}
return false;
}
/** Collect all adjacent spans occurring in a sequence.
* @return a list of the adjacent spans
* */
private List<CandidateSpan> collectAdjacentSpans() throws IOException {
CandidateSpan startSpan = new CandidateSpan(firstSpans);
List<CandidateSpan> adjacentSpans = new ArrayList<CandidateSpan>();
adjacentSpans.add(startSpan);
CandidateSpan prevSpan = startSpan;
while ((hasMoreSpans = firstSpans.next()) &&
startSpan.getDoc() == firstSpans.doc() ){
if (firstSpans.start() > prevSpan.getEnd()){
break;
}
else if (firstSpans.start() == prevSpan.getEnd()){
prevSpan = new CandidateSpan(firstSpans);
adjacentSpans.add(prevSpan);
}
}
return adjacentSpans;
}
/** Generate all possible repetition candidate spans from the adjacent spans
* and add them to the match list.
* */
private void setMatchList(List<CandidateSpan> adjacentSpans){
CandidateSpan startSpan, endSpan, matchSpan;
for (int i=min; i<max+1; i++){
//System.out.println("num: "+i);
int j=0;
int endIndex;
while ((endIndex = j+i-1) < adjacentSpans.size()){
startSpan = adjacentSpans.get(j);
if (i == 1){
try {
matchSpan = startSpan.clone();
matchSpan.setPayloads(computeMatchPayload(adjacentSpans, 0, endIndex-1));
matchList.add(matchSpan);
} catch (CloneNotSupportedException e) {
e.printStackTrace();
}
}
else {
endSpan = adjacentSpans.get(endIndex);
matchSpan = new CandidateSpan(
startSpan.getStart(),
endSpan.getEnd(),
startSpan.getDoc(),
computeMatchCost(adjacentSpans, 0, endIndex),
computeMatchPayload(adjacentSpans, 0, endIndex));
//System.out.println("c:"+matchSpan.getCost() +" p:"+ matchSpan.getPayloads().size());
//System.out.println(startSpan.getStart() +","+endSpan.getEnd());
matchList.add(matchSpan);
}
j++;
}
}
Collections.sort(matchList);
}
/** Add all the payloads of a candidate span
* */
private Collection<byte[]> computeMatchPayload(
List<CandidateSpan> adjacentSpans, int start, int end) {
Collection<byte[]> payload = new ArrayList<byte[]>();
for (int i=start; i<= end; i++){
payload.addAll(adjacentSpans.get(i).getPayloads());
}
return payload;
}
/** Add all the cost of a candidate span
* */
private long computeMatchCost(List<CandidateSpan> adjacentSpans,
int start, int end){
long matchCost = 0;
for (int i=start; i<= end; i++){
CandidateSpan c = adjacentSpans.get(i);
matchCost += adjacentSpans.get(i).getCost();
}
return matchCost;
}
/** Setting match properties from the candidate span
* */
private void setMatchProperties(CandidateSpan candidateSpan)
throws IOException {
matchDocNumber = candidateSpan.getDoc();
matchStartPosition = candidateSpan.getStart();
matchEndPosition = candidateSpan.getEnd();
if (collectPayloads && candidateSpan.getPayloads() != null) {
matchPayload.addAll(candidateSpan.getPayloads());
}
if (DEBUG)
log.trace("doc# {}, start {}, end {}",matchDocNumber,matchStartPosition,
matchEndPosition);
}
@Override
public boolean skipTo(int target) throws IOException {
if (hasMoreSpans && firstSpans.doc() < target){
if (!firstSpans.skipTo(target)){
hasMoreSpans = false;
return false;
}
}
matchList.clear();
return advance();
}
@Override
public long cost() {
return matchCost;
}
}