blob: 60e2a8aa3b52fd9360e5780fabc74ceed83c6006 [file] [log] [blame]
Eliza Margaretha1751dbf2014-02-03 09:47:19 +00001package de.ids_mannheim.korap.query.spans;
2
3import java.io.IOException;
Eliza Margaretha9738c392014-02-03 17:04:53 +00004import java.util.ArrayList;
5import java.util.Collection;
6import java.util.LinkedList;
7import java.util.List;
Eliza Margaretha1751dbf2014-02-03 09:47:19 +00008import java.util.Map;
9
10import org.apache.lucene.index.AtomicReaderContext;
11import org.apache.lucene.index.Term;
12import org.apache.lucene.index.TermContext;
Eliza Margaretha9738c392014-02-03 17:04:53 +000013import org.apache.lucene.search.spans.Spans;
Eliza Margaretha1751dbf2014-02-03 09:47:19 +000014import org.apache.lucene.util.Bits;
15
16import de.ids_mannheim.korap.query.SpanDistanceQuery;
17
Eliza Margaretha609a5be2014-12-18 16:52:20 +000018/**
Nils Diewaldbb33da22015-03-04 16:24:25 +000019 * Enumeration of span matches, whose two child spans have a specific
20 * range of
21 * distance (within a minimum and a maximum distance) and can occur in
22 * any
Eliza Margaretha609a5be2014-12-18 16:52:20 +000023 * order.
24 *
25 * @author margaretha
Eliza Margaretha9738c392014-02-03 17:04:53 +000026 * */
Eliza Margaretha609a5be2014-12-18 16:52:20 +000027public abstract class UnorderedDistanceSpans extends DistanceSpans {
Eliza Margaretha1751dbf2014-02-03 09:47:19 +000028
Eliza Margaretha609a5be2014-12-18 16:52:20 +000029 protected int minDistance, maxDistance;
30 protected boolean hasMoreFirstSpans, hasMoreSecondSpans;
31 protected List<CandidateSpan> firstSpanList, secondSpanList;
32 protected List<CandidateSpan> matchList;
33 private long matchCost;
34 private int matchListSpanNum;
35 protected int currentDocNum;
Nils Diewald34eaa862014-06-03 10:56:27 +000036
Nils Diewaldbb33da22015-03-04 16:24:25 +000037
Eliza Margaretha609a5be2014-12-18 16:52:20 +000038 /**
Nils Diewaldbb33da22015-03-04 16:24:25 +000039 * Constructs UnorderedDistanceSpans for the given
40 * {@link SpanDistanceQuery} .
Eliza Margaretha609a5be2014-12-18 16:52:20 +000041 *
Nils Diewaldbb33da22015-03-04 16:24:25 +000042 * @param query
43 * a SpanDistanceQuery
Eliza Margaretha609a5be2014-12-18 16:52:20 +000044 * @param context
45 * @param acceptDocs
46 * @param termContexts
47 * @throws IOException
48 */
Nils Diewaldbb33da22015-03-04 16:24:25 +000049 public UnorderedDistanceSpans (SpanDistanceQuery query,
50 AtomicReaderContext context,
51 Bits acceptDocs,
52 Map<Term, TermContext> termContexts)
53 throws IOException {
Eliza Margaretha609a5be2014-12-18 16:52:20 +000054 super(query, context, acceptDocs, termContexts);
55 minDistance = query.getMinDistance();
56 maxDistance = query.getMaxDistance();
Nils Diewald34eaa862014-06-03 10:56:27 +000057
Eliza Margaretha609a5be2014-12-18 16:52:20 +000058 firstSpanList = new ArrayList<CandidateSpan>();
59 secondSpanList = new ArrayList<CandidateSpan>();
60 matchList = new ArrayList<CandidateSpan>();
Eliza Margaretha1751dbf2014-02-03 09:47:19 +000061
Eliza Margaretha609a5be2014-12-18 16:52:20 +000062 hasMoreFirstSpans = firstSpans.next();
63 hasMoreSecondSpans = secondSpans.next();
64 hasMoreSpans = hasMoreFirstSpans && hasMoreSecondSpans;
65 }
Eliza Margaretha9738c392014-02-03 17:04:53 +000066
Nils Diewaldbb33da22015-03-04 16:24:25 +000067
Eliza Margaretha609a5be2014-12-18 16:52:20 +000068 @Override
Nils Diewaldbb33da22015-03-04 16:24:25 +000069 protected boolean advance () throws IOException {
Eliza Margaretha609a5be2014-12-18 16:52:20 +000070 while (hasMoreSpans || !matchList.isEmpty()) {
71 if (!matchList.isEmpty()) {
72 setMatchProperties();
73 return true;
74 }
75 if (prepareLists())
76 setMatchList();
77 }
78 return false;
79 }
Nils Diewald34eaa862014-06-03 10:56:27 +000080
Nils Diewaldbb33da22015-03-04 16:24:25 +000081
Eliza Margaretha609a5be2014-12-18 16:52:20 +000082 /**
Nils Diewaldbb33da22015-03-04 16:24:25 +000083 * Updates the firstSpanList and secondSpanList by adding the next
84 * possible
85 * first and second spans. Both the spans must be in the same
86 * document. In
87 * UnorderedElementDistanceSpans, a span that is not in an element
88 * (distance
89 * unit), is not added to its candidate list. The element must
90 * also be in
Eliza Margaretha609a5be2014-12-18 16:52:20 +000091 * the same document.
92 *
Nils Diewaldbb33da22015-03-04 16:24:25 +000093 * @return <code>true</code> if at least one of the candidate
94 * lists can be
Eliza Margaretha609a5be2014-12-18 16:52:20 +000095 * filled, <code>false</code> otherwise.
96 * @throws IOException
97 */
Nils Diewaldbb33da22015-03-04 16:24:25 +000098 protected abstract boolean prepareLists () throws IOException;
99
Eliza Margaretha1751dbf2014-02-03 09:47:19 +0000100
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000101 /**
Nils Diewaldbb33da22015-03-04 16:24:25 +0000102 * Sets the list of matches for the span having the smallest
margaretha50110f32015-05-12 18:21:29 +0200103 * position (i.e. between the first and the second spans), and its
104 * candidates (i.e. its counterparts). The candidates also must
105 * have smaller positions. Simply remove the span if it does not
106 * have any candidates.
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000107 *
108 * @throws IOException
109 */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000110 protected void setMatchList () throws IOException {
Eliza Margaretha1751dbf2014-02-03 09:47:19 +0000111
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000112 hasMoreFirstSpans = setCandidateList(firstSpanList, firstSpans,
113 hasMoreFirstSpans, secondSpanList);
114 hasMoreSecondSpans = setCandidateList(secondSpanList, secondSpans,
115 hasMoreSecondSpans, firstSpanList);
116 // System.out.println("--------------------");
117 // System.out.println("firstSpanList:");
margaretha50110f32015-05-12 18:21:29 +0200118 // for (CandidateSpan cs : firstSpanList) {
119 // System.out.println(cs.getStart() + " " + cs.getEnd());
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000120 // }
121 //
122 // System.out.println("secondSpanList:");
margaretha50110f32015-05-12 18:21:29 +0200123 // for (CandidateSpan cs : secondSpanList) {
124 // System.out.println(cs.getStart() + " " + cs.getEnd());
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000125 // }
126
127 CandidateSpan currentFirstSpan, currentSecondSpan;
128 if (!firstSpanList.isEmpty() && !secondSpanList.isEmpty()) {
129
130 currentFirstSpan = firstSpanList.get(0);
131 currentSecondSpan = secondSpanList.get(0);
132
margaretha50110f32015-05-12 18:21:29 +0200133 if (currentFirstSpan.getStart() < currentSecondSpan.getStart()
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000134 || isLastCandidateSmaller(currentFirstSpan,
135 currentSecondSpan)) {
136 // log.trace("current target: "
137 // + firstSpanList.get(0).getStart() + " "
138 // + firstSpanList.get(0).getEnd());
139 // System.out.println("candidates:");
140 // for (CandidateSpan cs: secondSpanList) {
141 // System.out.println(cs.getStart() +" "+ cs.getEnd());
142 // }
143
144 matchList = findMatches(currentFirstSpan, secondSpanList);
145 setMatchFirstSpan(currentFirstSpan);
146 matchListSpanNum = 2;
147 updateList(firstSpanList);
Nils Diewaldbb33da22015-03-04 16:24:25 +0000148 }
149 else {
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000150 // log.trace("current target: "
151 // + secondSpanList.get(0).getStart() + " "
152 // + secondSpanList.get(0).getEnd());
153 // System.out.println("candidates:");
154 // for (CandidateSpan cs: firstSpanList) {
155 // System.out.println(cs.getStart() +" "+ cs.getEnd());
156 // }
157
158 matchList = findMatches(currentSecondSpan, firstSpanList);
159 setMatchSecondSpan(currentSecondSpan);
160 matchListSpanNum = 1;
161 updateList(secondSpanList);
162 }
Nils Diewaldbb33da22015-03-04 16:24:25 +0000163 }
164 else if (firstSpanList.isEmpty()) {
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000165 // log.trace("current target: " + secondSpanList.get(0).getStart()
166 // + " " + secondSpanList.get(0).getEnd());
167 // log.trace("candidates: empty");
168 updateList(secondSpanList);
Nils Diewaldbb33da22015-03-04 16:24:25 +0000169 }
170 else {
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000171 // log.trace("current target: " + firstSpanList.get(0).getStart()
172 // + " " + firstSpanList.get(0).getEnd());
173 // log.trace("candidates: empty");
174 updateList(firstSpanList);
175 }
176 }
177
Nils Diewaldbb33da22015-03-04 16:24:25 +0000178
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000179 /**
Nils Diewaldbb33da22015-03-04 16:24:25 +0000180 * Tells if the last candidate from the secondSpanList has a
margaretha50110f32015-05-12 18:21:29 +0200181 * smaller end position than the end position of the the last
182 * candidate from the firstSpanList.
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000183 *
Nils Diewaldbb33da22015-03-04 16:24:25 +0000184 * @param currentFirstSpan
185 * the current firstspan
186 * @param currentSecondSpan
187 * the current secondspan
188 * @return <code>true</code> if the end position of the last
margaretha50110f32015-05-12 18:21:29 +0200189 * candidate from the secondSpanList is smaller than that
190 * from the firstSpanList, <code>false</code> otherwise.
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000191 */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000192 private boolean isLastCandidateSmaller (CandidateSpan currentFirstSpan,
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000193 CandidateSpan currentSecondSpan) {
194 if (currentFirstSpan.getEnd() == currentSecondSpan.getEnd()) {
195 int secondEnd = secondSpanList.get(secondSpanList.size() - 1)
196 .getEnd();
197 int firstEnd = firstSpanList.get(firstSpanList.size() - 1).getEnd();
198 return (secondEnd < firstEnd ? true : false);
199 }
200
201 return false;
202 }
203
Nils Diewaldbb33da22015-03-04 16:24:25 +0000204
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000205 /**
206 * Performs an update based on the given candidateList. In
207 * {@link UnorderedTokenDistanceSpans}, the first candidate in the
Nils Diewaldbb33da22015-03-04 16:24:25 +0000208 * candidateList is simply removed. In
209 * {@link UnorderedElementDistanceSpans} , the elementList is also
210 * updated.
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000211 *
Nils Diewaldbb33da22015-03-04 16:24:25 +0000212 * @param candidateList
213 * a candidateList
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000214 */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000215 protected abstract void updateList (List<CandidateSpan> candidateList);
216
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000217
218 /**
Nils Diewaldbb33da22015-03-04 16:24:25 +0000219 * Sets the candidate list for the first element in the target
220 * list and
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000221 * tells if the the specified spans has finished or not.
222 *
Nils Diewaldbb33da22015-03-04 16:24:25 +0000223 * @param candidateList
224 * a list of candidate spans
225 * @param candidate
226 * a Spans
227 * @param hasMoreCandidates
228 * a boolean
229 * @param targetList
230 * a list of target spans
231 * @return <code>true</code> if the span enumeration still has a
232 * next
233 * element to be a candidate, <code>false</code>
234 * otherwise.
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000235 * @throws IOException
236 */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000237 protected abstract boolean setCandidateList (
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000238 List<CandidateSpan> candidateList, Spans candidate,
239 boolean hasMoreCandidates, List<CandidateSpan> targetList)
240 throws IOException;
241
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000242
243 /**
Nils Diewaldbb33da22015-03-04 16:24:25 +0000244 * Finds all matches between the target span and its candidates in
245 * the
246 * candidate list.
247 *
248 * @param target
249 * a target span
250 * @param candidateList
251 * a candidate list
252 * @return the matches in a list
253 */
254 protected abstract List<CandidateSpan> findMatches (CandidateSpan target,
255 List<CandidateSpan> candidateList);
256
257
258 /**
259 * Computes match properties and creates a candidate span match to
260 * be added
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000261 * to the match list.
262 *
263 * @return a candidate span match
264 * */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000265 protected CandidateSpan createMatchCandidate (CandidateSpan target,
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000266 CandidateSpan cs, boolean isDistanceZero) {
267
268 int start = Math.min(target.getStart(), cs.getStart());
269 int end = Math.max(target.getEnd(), cs.getEnd());
270 int doc = target.getDoc();
271 long cost = target.getCost() + cs.getCost();
272
273 Collection<byte[]> payloads = new LinkedList<byte[]>();
274 if (collectPayloads) {
275 if (target.getPayloads() != null) {
276 payloads.addAll(target.getPayloads());
277 }
278 if (cs.getPayloads() != null) {
279 payloads.addAll(cs.getPayloads());
280 }
281 }
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000282 CandidateSpan match = new CandidateSpan(start, end, doc, cost, payloads);
283 match.setChildSpan(cs);
284 return match;
285 }
286
Nils Diewaldbb33da22015-03-04 16:24:25 +0000287
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000288 /**
Nils Diewaldbb33da22015-03-04 16:24:25 +0000289 * Assigns the first candidate span in the match list as the
290 * current span
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000291 * match, and removes it from the matchList.
292 * */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000293 private void setMatchProperties () {
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000294 CandidateSpan cs = matchList.get(0);
295 matchDocNumber = cs.getDoc();
296 matchStartPosition = cs.getStart();
297 matchEndPosition = cs.getEnd();
298 matchCost = cs.getCost();
299 matchPayload.addAll(cs.getPayloads());
300 matchList.remove(0);
301
302 if (matchListSpanNum == 1)
303 setMatchFirstSpan(cs.getChildSpan());
304 else
305 setMatchSecondSpan(cs.getChildSpan());
306
307 // log.trace("Match doc#={} start={} end={}", matchDocNumber,
308 // matchStartPosition, matchEndPosition);
309 // log.trace("firstspan " + getMatchFirstSpan().getStart() + " "
310 // + getMatchFirstSpan().getEnd());
311 // log.trace("secondspan " + getMatchSecondSpan().getStart() + " "
312 // + getMatchSecondSpan().getEnd());
313 }
314
Nils Diewaldbb33da22015-03-04 16:24:25 +0000315
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000316 @Override
Nils Diewaldbb33da22015-03-04 16:24:25 +0000317 public boolean skipTo (int target) throws IOException {
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000318 if (hasMoreSpans && (secondSpans.doc() < target)) {
319 if (!secondSpans.skipTo(target)) {
320 hasMoreSpans = false;
321 return false;
322 }
323 }
324
325 firstSpanList.clear();
326 secondSpanList.clear();
327 matchPayload.clear();
328 isStartEnumeration = false;
329 return advance();
330 }
331
Nils Diewaldbb33da22015-03-04 16:24:25 +0000332
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000333 @Override
Nils Diewaldbb33da22015-03-04 16:24:25 +0000334 public long cost () {
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000335 return matchCost;
336 }
Eliza Margaretha1751dbf2014-02-03 09:47:19 +0000337
338}