blob: 705450d1531a051ac2ef257b707b6b0f8a514ec2 [file] [log] [blame]
package de.ids_mannheim.korap.query.spans;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.util.Bits;
import de.ids_mannheim.korap.query.SpanDistanceQuery;
/**
* Enumeration of span matches, whose two child spans have a specific
* range of
* distance (within a minimum and a maximum distance) and can occur in
* any
* order.
*
* @author margaretha
* */
public abstract class UnorderedDistanceSpans extends DistanceSpans {
protected int minDistance, maxDistance;
protected boolean hasMoreFirstSpans, hasMoreSecondSpans;
protected List<CandidateSpan> firstSpanList, secondSpanList;
protected List<CandidateSpan> matchList;
private long matchCost;
private int matchListSpanNum;
protected int currentDocNum;
/**
* Constructs UnorderedDistanceSpans for the given
* {@link SpanDistanceQuery} .
*
* @param query
* a SpanDistanceQuery
* @param context
* @param acceptDocs
* @param termContexts
* @throws IOException
*/
public UnorderedDistanceSpans (SpanDistanceQuery query,
LeafReaderContext context,
Bits acceptDocs,
Map<Term, TermContext> termContexts)
throws IOException {
super(query, context, acceptDocs, termContexts);
minDistance = query.getMinDistance();
maxDistance = query.getMaxDistance();
firstSpanList = new ArrayList<CandidateSpan>();
secondSpanList = new ArrayList<CandidateSpan>();
matchList = new ArrayList<CandidateSpan>();
hasMoreFirstSpans = firstSpans.next();
hasMoreSecondSpans = secondSpans.next();
hasMoreSpans = hasMoreFirstSpans && hasMoreSecondSpans;
}
@Override
protected boolean advance () throws IOException {
while (hasMoreSpans || !matchList.isEmpty()) {
if (!matchList.isEmpty()) {
setMatchProperties();
return true;
}
if (prepareLists())
setMatchList();
}
return false;
}
/**
* Updates the firstSpanList and secondSpanList by adding the next
* possible
* first and second spans. Both the spans must be in the same
* document. In
* UnorderedElementDistanceSpans, a span that is not in an element
* (distance
* unit), is not added to its candidate list. The element must
* also be in
* the same document.
*
* @return <code>true</code> if at least one of the candidate
* lists can be
* filled, <code>false</code> otherwise.
* @throws IOException
*/
protected abstract boolean prepareLists () throws IOException;
/**
* Sets the list of matches for the span having the smallest
* position (i.e. between the first and the second spans), and its
* candidates (i.e. its counterparts). The candidates also must
* have smaller positions. Simply remove the span if it does not
* have any candidates.
*
* @throws IOException
*/
protected void setMatchList () throws IOException {
hasMoreFirstSpans = setCandidateList(firstSpanList, firstSpans,
hasMoreFirstSpans, secondSpanList);
hasMoreSecondSpans = setCandidateList(secondSpanList, secondSpans,
hasMoreSecondSpans, firstSpanList);
// System.out.println("--------------------");
// System.out.println("firstSpanList:");
// for (CandidateSpan cs : firstSpanList) {
// System.out.println(cs.getStart() + " " + cs.getEnd());
// }
//
// System.out.println("secondSpanList:");
// for (CandidateSpan cs : secondSpanList) {
// System.out.println(cs.getStart() + " " + cs.getEnd());
// }
CandidateSpan currentFirstSpan, currentSecondSpan;
if (!firstSpanList.isEmpty() && !secondSpanList.isEmpty()) {
currentFirstSpan = firstSpanList.get(0);
currentSecondSpan = secondSpanList.get(0);
if (currentFirstSpan.getStart() < currentSecondSpan.getStart()
|| isLastCandidateSmaller(currentFirstSpan,
currentSecondSpan)) {
// log.trace("current target: "
// + firstSpanList.get(0).getStart() + " "
// + firstSpanList.get(0).getEnd());
// System.out.println("candidates:");
// for (CandidateSpan cs: secondSpanList) {
// System.out.println(cs.getStart() +" "+ cs.getEnd());
// }
matchList = findMatches(currentFirstSpan, secondSpanList);
setMatchFirstSpan(currentFirstSpan);
matchListSpanNum = 2;
updateList(firstSpanList);
}
else {
// log.trace("current target: "
// + secondSpanList.get(0).getStart() + " "
// + secondSpanList.get(0).getEnd());
// System.out.println("candidates:");
// for (CandidateSpan cs: firstSpanList) {
// System.out.println(cs.getStart() +" "+ cs.getEnd());
// }
matchList = findMatches(currentSecondSpan, firstSpanList);
setMatchSecondSpan(currentSecondSpan);
matchListSpanNum = 1;
updateList(secondSpanList);
}
}
else if (firstSpanList.isEmpty()) {
// log.trace("current target: " + secondSpanList.get(0).getStart()
// + " " + secondSpanList.get(0).getEnd());
// log.trace("candidates: empty");
updateList(secondSpanList);
}
else {
// log.trace("current target: " + firstSpanList.get(0).getStart()
// + " " + firstSpanList.get(0).getEnd());
// log.trace("candidates: empty");
updateList(firstSpanList);
}
}
/**
* Tells if the last candidate from the secondSpanList has a
* smaller end position than the end position of the the last
* candidate from the firstSpanList.
*
* @param currentFirstSpan
* the current firstspan
* @param currentSecondSpan
* the current secondspan
* @return <code>true</code> if the end position of the last
* candidate from the secondSpanList is smaller than that
* from the firstSpanList, <code>false</code> otherwise.
*/
private boolean isLastCandidateSmaller (CandidateSpan currentFirstSpan,
CandidateSpan currentSecondSpan) {
if (currentFirstSpan.getEnd() == currentSecondSpan.getEnd()) {
int secondEnd = secondSpanList.get(secondSpanList.size() - 1)
.getEnd();
int firstEnd = firstSpanList.get(firstSpanList.size() - 1).getEnd();
return (secondEnd < firstEnd ? true : false);
}
return false;
}
/**
* Performs an update based on the given candidateList. In
* {@link UnorderedTokenDistanceSpans}, the first candidate in the
* candidateList is simply removed. In
* {@link UnorderedElementDistanceSpans} , the elementList is also
* updated.
*
* @param candidateList
* a candidateList
*/
protected abstract void updateList (List<CandidateSpan> candidateList);
/**
* Sets the candidate list for the first element in the target
* list and
* tells if the the specified spans has finished or not.
*
* @param candidateList
* a list of candidate spans
* @param candidate
* a Spans
* @param hasMoreCandidates
* a boolean
* @param targetList
* a list of target spans
* @return <code>true</code> if the span enumeration still has a
* next
* element to be a candidate, <code>false</code>
* otherwise.
* @throws IOException
*/
protected abstract boolean setCandidateList (
List<CandidateSpan> candidateList, Spans candidate,
boolean hasMoreCandidates, List<CandidateSpan> targetList)
throws IOException;
/**
* Finds all matches between the target span and its candidates in
* the
* candidate list.
*
* @param target
* a target span
* @param candidateList
* a candidate list
* @return the matches in a list
*/
protected abstract List<CandidateSpan> findMatches (CandidateSpan target,
List<CandidateSpan> candidateList);
/**
* Computes match properties and creates a candidate span match to
* be added
* to the match list.
*
* @return a candidate span match
* */
protected CandidateSpan createMatchCandidate (CandidateSpan target,
CandidateSpan cs, boolean isDistanceZero) {
int start = Math.min(target.getStart(), cs.getStart());
int end = Math.max(target.getEnd(), cs.getEnd());
int doc = target.getDoc();
long cost = target.getCost() + cs.getCost();
Collection<byte[]> payloads = new LinkedList<byte[]>();
if (collectPayloads) {
if (target.getPayloads() != null) {
payloads.addAll(target.getPayloads());
}
if (cs.getPayloads() != null) {
payloads.addAll(cs.getPayloads());
}
}
CandidateSpan match = new CandidateSpan(start, end, doc, cost, payloads);
match.setChildSpan(cs);
return match;
}
/**
* Assigns the first candidate span in the match list as the
* current span
* match, and removes it from the matchList.
* */
private void setMatchProperties () {
CandidateSpan cs = matchList.get(0);
matchDocNumber = cs.getDoc();
matchStartPosition = cs.getStart();
matchEndPosition = cs.getEnd();
matchCost = cs.getCost();
matchPayload.addAll(cs.getPayloads());
matchList.remove(0);
if (matchListSpanNum == 1)
setMatchFirstSpan(cs.getChildSpan());
else
setMatchSecondSpan(cs.getChildSpan());
// log.trace("Match doc#={} start={} end={}", matchDocNumber,
// matchStartPosition, matchEndPosition);
// log.trace("firstspan " + getMatchFirstSpan().getStart() + " "
// + getMatchFirstSpan().getEnd());
// log.trace("secondspan " + getMatchSecondSpan().getStart() + " "
// + getMatchSecondSpan().getEnd());
}
@Override
public boolean skipTo (int target) throws IOException {
if (hasMoreSpans && (secondSpans.doc() < target)) {
if (!secondSpans.skipTo(target)) {
hasMoreSpans = false;
return false;
}
}
firstSpanList.clear();
secondSpanList.clear();
matchPayload.clear();
isStartEnumeration = false;
return advance();
}
@Override
public long cost () {
return matchCost;
}
}