Improved fix in NextSpans so that the candidate list is recreated in case the match list is empty
Change-Id: I69778b018458f235945dc64b4bf0601269a45f06
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/NextSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/NextSpans.java
index 9f71154..8a22191 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/NextSpans.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/NextSpans.java
@@ -85,12 +85,22 @@
|| !candidateList.isEmpty()) {
// Check, if the matchlist is fine
- // It may be enough to clear it though
- while (!matchList.isEmpty() && matchList.get(0).getDoc() != firstSpans.doc()) {
- matchList.remove(0);
+ if (!matchList.isEmpty() &&
+ candidateListDocNum != firstSpans.doc()) {
+
if (DEBUG) {
- log.debug("Remove first entry from matchlist because it's not in the same doc");
+ log.debug(
+ "Remove entries from matchlist because " +
+ "it's not in the same doc {}!={}",
+ firstSpans.doc(),
+ candidateListDocNum);
};
+
+ // Clear matchList
+ matchList.clear();
+
+ // Set new matchlist
+ setMatchList();
};
if (!matchList.isEmpty()) {
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/RepetitionSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/RepetitionSpans.java
index 22314e4..7e57a80 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/RepetitionSpans.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/RepetitionSpans.java
@@ -86,13 +86,17 @@
while (hasMoreSpans || !matchList.isEmpty()) {
if (!matchList.isEmpty()) {
+
+ // Take the first element of the matchlist
setMatchProperties(matchList.get(0));
matchList.remove(0);
+
return true;
}
matchCost = 0;
List<CandidateSpan> adjacentSpans = collectAdjacentSpans();
+
setMatchList(adjacentSpans);
}
return false;
@@ -135,23 +139,30 @@
i++;
}
while ((hasMoreSpans = firstSpans.next())
- && startSpan.getDoc() == firstSpans.doc()) {
+ && startSpan.getDoc() == firstSpans.doc()) {
if (DEBUG) {
- log.debug("Check adjacency at {}-{}|{}-{} in {}",
- prevSpan.getStart(), prevSpan.getEnd(),
- firstSpans.start(), firstSpans.end(),
- startSpan.getDoc());
+ log.debug("Check adjacency of rep-spans at {}-{}|{}-{} in {}={}={}",
+ prevSpan.getStart(), prevSpan.getEnd(),
+ firstSpans.start(), firstSpans.end(),
+ startSpan.getDoc(), firstSpans.doc(), prevSpan.getDoc());
};
if (firstSpans.start() > prevSpan.getEnd()) {
candidates.add(new CandidateSpan(firstSpans));
break;
}
+
+ /*
+ * ND: This seems to be suboptimal, in cases of searching
+ * for "ab{2,3}c" and a match like "abbbbbbbbbbbbbbbbbbbbbbbbbbc".
+ */
else if (firstSpans.start() == prevSpan.getEnd()) {
prevSpan = new CandidateSpan(firstSpans);
adjacentSpans.add(prevSpan);
}
+
+ // firstSpan.start() < prevSpan.getEnd()
else {
candidates.add(new CandidateSpan(firstSpans));
}
@@ -169,6 +180,7 @@
*/
private void setMatchList (List<CandidateSpan> adjacentSpans) {
CandidateSpan startSpan, endSpan, matchSpan;
+
for (int i = min; i < max + 1; i++) {
int j = 0;
int endIndex;
@@ -180,6 +192,14 @@
matchSpan = startSpan.clone();
matchSpan.setPayloads(computeMatchPayload(adjacentSpans,
0, endIndex - 1));
+
+ if (DEBUG) {
+ log.debug("1. Add span to matchlist: {}-{} at {}",
+ matchSpan.getStart(),
+ matchSpan.getEnd(),
+ matchSpan.getDoc());
+ };
+
matchList.add(matchSpan);
}
catch (CloneNotSupportedException e) {
@@ -188,21 +208,36 @@
}
else {
endSpan = adjacentSpans.get(endIndex);
- matchSpan = new CandidateSpan(startSpan.getStart(),
- endSpan.getEnd(), startSpan.getDoc(),
- computeMatchCost(adjacentSpans, 0, endIndex),
- computeMatchPayload(adjacentSpans, 0, endIndex));
+ matchSpan = new CandidateSpan(
+ startSpan.getStart(),
+ endSpan.getEnd(),
+ startSpan.getDoc(),
+ computeMatchCost(adjacentSpans, 0, endIndex),
+ computeMatchPayload(adjacentSpans, 0, endIndex)
+ );
//System.out.println("c:"+matchSpan.getCost() +" p:"+ matchSpan.getPayloads().size());
//System.out.println(startSpan.getStart() +","+endSpan.getEnd());
+
+ if (DEBUG) {
+ log.debug("2. Add span to matchlist: {}-{} at {}={}",
+ matchSpan.getStart(),
+ matchSpan.getEnd(),
+ matchSpan.getDoc(),
+ endSpan.getDoc());
+ };
+
matchList.add(matchSpan);
}
j++;
}
+ /*
if (j + i == adjacentSpans.size()) {
}
+ */
}
+
Collections.sort(matchList);
}
@@ -269,6 +304,14 @@
*/
private void setMatchProperties (CandidateSpan candidateSpan)
throws IOException {
+
+ if (DEBUG) {
+ log.debug("Set match properties to {}-{} at {}",
+ candidateSpan.getStart(),
+ candidateSpan.getEnd(),
+ candidateSpan.getDoc()
+ );
+ };
matchDocNumber = candidateSpan.getDoc();
matchStartPosition = candidateSpan.getStart();
matchEndPosition = candidateSpan.getEnd();
@@ -280,6 +323,12 @@
@Override
public boolean skipTo (int target) throws IOException {
+ if (DEBUG) {
+ log.debug("Skip repetitionSpans to {}", target);
+ };
+ matchDocNumber = -1;
+ matchStartPosition = -1;
+ matchEndPosition = -1;
if (!candidates.isEmpty()) {
Iterator<CandidateSpan> i = candidates.iterator();
while (i.hasNext()) {
@@ -298,6 +347,13 @@
hasMoreSpans = false;
return false;
}
+ if (DEBUG) {
+ log.debug("Skip firstSpans to {}={} succeed with positions {}-{}",
+ target,
+ firstSpans.doc(),
+ firstSpans.start(),
+ firstSpans.end());
+ };
}
matchList.clear();
return advance();