blob: 4db92124d09b904b0037b905153e0642724c17fb [file] [log] [blame]
Eliza Margaretha1751dbf2014-02-03 09:47:19 +00001package de.ids_mannheim.korap.query.spans;
2
3import java.io.IOException;
Eliza Margaretha9738c392014-02-03 17:04:53 +00004import java.util.ArrayList;
5import java.util.Collection;
margarethac66265c2016-12-14 13:48:45 +01006import java.util.Collections;
Eliza Margaretha9738c392014-02-03 17:04:53 +00007import java.util.LinkedList;
8import java.util.List;
Eliza Margaretha1751dbf2014-02-03 09:47:19 +00009import java.util.Map;
10
margarethabb5c26c2019-03-12 13:54:22 +010011import org.apache.logging.log4j.LogManager;
12import org.apache.logging.log4j.Logger;
Akron700c1eb2015-09-25 16:57:30 +020013import org.apache.lucene.index.LeafReaderContext;
Eliza Margaretha1751dbf2014-02-03 09:47:19 +000014import org.apache.lucene.index.Term;
15import org.apache.lucene.index.TermContext;
Eliza Margaretha9738c392014-02-03 17:04:53 +000016import org.apache.lucene.search.spans.Spans;
Eliza Margaretha1751dbf2014-02-03 09:47:19 +000017import org.apache.lucene.util.Bits;
18
19import de.ids_mannheim.korap.query.SpanDistanceQuery;
20
Eliza Margaretha609a5be2014-12-18 16:52:20 +000021/**
Nils Diewaldbb33da22015-03-04 16:24:25 +000022 * Enumeration of span matches, whose two child spans have a specific
margarethac66265c2016-12-14 13:48:45 +010023 * range of distance (within a minimum and a maximum distance) and can
24 * occur in any order.
Eliza Margaretha609a5be2014-12-18 16:52:20 +000025 *
26 * @author margaretha
Eliza Margaretha6f989202016-10-14 21:48:29 +020027 */
Eliza Margaretha609a5be2014-12-18 16:52:20 +000028public abstract class UnorderedDistanceSpans extends DistanceSpans {
Eliza Margaretha1751dbf2014-02-03 09:47:19 +000029
Eliza Margaretha609a5be2014-12-18 16:52:20 +000030 protected int minDistance, maxDistance;
31 protected boolean hasMoreFirstSpans, hasMoreSecondSpans;
32 protected List<CandidateSpan> firstSpanList, secondSpanList;
33 protected List<CandidateSpan> matchList;
34 private long matchCost;
Eliza Margaretha609a5be2014-12-18 16:52:20 +000035 protected int currentDocNum;
Nils Diewald34eaa862014-06-03 10:56:27 +000036
Eliza Margaretha609a5be2014-12-18 16:52:20 +000037 /**
Nils Diewaldbb33da22015-03-04 16:24:25 +000038 * Constructs UnorderedDistanceSpans for the given
39 * {@link SpanDistanceQuery} .
Eliza Margaretha609a5be2014-12-18 16:52:20 +000040 *
Nils Diewaldbb33da22015-03-04 16:24:25 +000041 * @param query
42 * a SpanDistanceQuery
Eliza Margaretha609a5be2014-12-18 16:52:20 +000043 * @param context
44 * @param acceptDocs
45 * @param termContexts
46 * @throws IOException
47 */
Nils Diewaldbb33da22015-03-04 16:24:25 +000048 public UnorderedDistanceSpans (SpanDistanceQuery query,
Akron42993552016-02-04 13:24:24 +010049 LeafReaderContext context, Bits acceptDocs,
Nils Diewaldbb33da22015-03-04 16:24:25 +000050 Map<Term, TermContext> termContexts)
51 throws IOException {
Eliza Margaretha609a5be2014-12-18 16:52:20 +000052 super(query, context, acceptDocs, termContexts);
53 minDistance = query.getMinDistance();
54 maxDistance = query.getMaxDistance();
Nils Diewald34eaa862014-06-03 10:56:27 +000055
Eliza Margaretha609a5be2014-12-18 16:52:20 +000056 firstSpanList = new ArrayList<CandidateSpan>();
57 secondSpanList = new ArrayList<CandidateSpan>();
58 matchList = new ArrayList<CandidateSpan>();
Eliza Margaretha1751dbf2014-02-03 09:47:19 +000059
Eliza Margaretha609a5be2014-12-18 16:52:20 +000060 hasMoreFirstSpans = firstSpans.next();
61 hasMoreSecondSpans = secondSpans.next();
62 hasMoreSpans = hasMoreFirstSpans && hasMoreSecondSpans;
63 }
Eliza Margaretha9738c392014-02-03 17:04:53 +000064
Eliza Margaretha609a5be2014-12-18 16:52:20 +000065 @Override
Nils Diewaldbb33da22015-03-04 16:24:25 +000066 protected boolean advance () throws IOException {
Eliza Margaretha609a5be2014-12-18 16:52:20 +000067 while (hasMoreSpans || !matchList.isEmpty()) {
68 if (!matchList.isEmpty()) {
69 setMatchProperties();
70 return true;
71 }
margarethabb5c26c2019-03-12 13:54:22 +010072 if (prepareLists()) setMatchList();
Eliza Margaretha609a5be2014-12-18 16:52:20 +000073 }
74 return false;
75 }
Nils Diewald34eaa862014-06-03 10:56:27 +000076
Eliza Margaretha609a5be2014-12-18 16:52:20 +000077 /**
Nils Diewaldbb33da22015-03-04 16:24:25 +000078 * Updates the firstSpanList and secondSpanList by adding the next
margarethac66265c2016-12-14 13:48:45 +010079 * possible first and second spans. Both the spans must be in the
80 * same document. In UnorderedElementDistanceSpans, a span that is
81 * not in an element (distance unit), is not added to its
82 * candidate list. The element must also be in the same document.
Eliza Margaretha609a5be2014-12-18 16:52:20 +000083 *
Nils Diewaldbb33da22015-03-04 16:24:25 +000084 * @return <code>true</code> if at least one of the candidate
85 * lists can be
Eliza Margaretha609a5be2014-12-18 16:52:20 +000086 * filled, <code>false</code> otherwise.
87 * @throws IOException
88 */
Nils Diewaldbb33da22015-03-04 16:24:25 +000089 protected abstract boolean prepareLists () throws IOException;
90
Eliza Margaretha609a5be2014-12-18 16:52:20 +000091 /**
Nils Diewaldbb33da22015-03-04 16:24:25 +000092 * Sets the list of matches for the span having the smallest
margaretha50110f32015-05-12 18:21:29 +020093 * position (i.e. between the first and the second spans), and its
94 * candidates (i.e. its counterparts). The candidates also must
95 * have smaller positions. Simply remove the span if it does not
96 * have any candidates.
Eliza Margaretha609a5be2014-12-18 16:52:20 +000097 *
98 * @throws IOException
99 */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000100 protected void setMatchList () throws IOException {
Eliza Margaretha1751dbf2014-02-03 09:47:19 +0000101
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000102 hasMoreFirstSpans = setCandidateList(firstSpanList, firstSpans,
103 hasMoreFirstSpans, secondSpanList);
104 hasMoreSecondSpans = setCandidateList(secondSpanList, secondSpans,
105 hasMoreSecondSpans, firstSpanList);
margarethabb5c26c2019-03-12 13:54:22 +0100106 // System.out.println("--------------------");
107 // System.out.println("firstSpanList:");
108 // for (CandidateSpan cs : firstSpanList) {
109 // System.out.println(cs.getStart() + " " + cs.getEnd());
110 // }
111 //
112 // System.out.println("secondSpanList:");
113 // for (CandidateSpan cs : secondSpanList) {
114 // System.out.println(cs.getStart() + " " + cs.getEnd());
115 // }
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000116
117 CandidateSpan currentFirstSpan, currentSecondSpan;
118 if (!firstSpanList.isEmpty() && !secondSpanList.isEmpty()) {
119
120 currentFirstSpan = firstSpanList.get(0);
121 currentSecondSpan = secondSpanList.get(0);
122
margaretha50110f32015-05-12 18:21:29 +0200123 if (currentFirstSpan.getStart() < currentSecondSpan.getStart()
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000124 || isLastCandidateSmaller(currentFirstSpan,
125 currentSecondSpan)) {
margarethac66265c2016-12-14 13:48:45 +0100126 matchList = findMatches(currentFirstSpan, secondSpanList, true);
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000127 updateList(firstSpanList);
Nils Diewaldbb33da22015-03-04 16:24:25 +0000128 }
129 else {
margarethabb5c26c2019-03-12 13:54:22 +0100130 matchList =
131 findMatches(currentSecondSpan, firstSpanList, false);
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000132 updateList(secondSpanList);
margarethac66265c2016-12-14 13:48:45 +0100133
134 if (currentFirstSpan.getStart() == currentSecondSpan.getStart()
135 && currentFirstSpan.getEnd() == currentSecondSpan
136 .getEnd()) {
137 matchList.addAll(findMatches(currentFirstSpan,
138 secondSpanList, false));
139 Collections.sort(matchList);
140 updateList(firstSpanList);
141 }
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000142 }
Nils Diewaldbb33da22015-03-04 16:24:25 +0000143 }
margarethabb5c26c2019-03-12 13:54:22 +0100144 else if (!secondSpanList.isEmpty()) {
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000145 updateList(secondSpanList);
Nils Diewaldbb33da22015-03-04 16:24:25 +0000146 }
margarethabb5c26c2019-03-12 13:54:22 +0100147 else if (!firstSpanList.isEmpty()) {
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000148 updateList(firstSpanList);
149 }
150 }
151
152 /**
Nils Diewaldbb33da22015-03-04 16:24:25 +0000153 * Tells if the last candidate from the secondSpanList has a
margaretha50110f32015-05-12 18:21:29 +0200154 * smaller end position than the end position of the the last
155 * candidate from the firstSpanList.
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000156 *
Nils Diewaldbb33da22015-03-04 16:24:25 +0000157 * @param currentFirstSpan
158 * the current firstspan
159 * @param currentSecondSpan
160 * the current secondspan
161 * @return <code>true</code> if the end position of the last
margaretha50110f32015-05-12 18:21:29 +0200162 * candidate from the secondSpanList is smaller than that
163 * from the firstSpanList, <code>false</code> otherwise.
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000164 */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000165 private boolean isLastCandidateSmaller (CandidateSpan currentFirstSpan,
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000166 CandidateSpan currentSecondSpan) {
167 if (currentFirstSpan.getEnd() == currentSecondSpan.getEnd()) {
margarethabb5c26c2019-03-12 13:54:22 +0100168 int secondEnd =
169 secondSpanList.get(secondSpanList.size() - 1).getEnd();
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000170 int firstEnd = firstSpanList.get(firstSpanList.size() - 1).getEnd();
171 return (secondEnd < firstEnd ? true : false);
172 }
173
174 return false;
175 }
176
177 /**
178 * Performs an update based on the given candidateList. In
179 * {@link UnorderedTokenDistanceSpans}, the first candidate in the
Nils Diewaldbb33da22015-03-04 16:24:25 +0000180 * candidateList is simply removed. In
181 * {@link UnorderedElementDistanceSpans} , the elementList is also
182 * updated.
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000183 *
Nils Diewaldbb33da22015-03-04 16:24:25 +0000184 * @param candidateList
185 * a candidateList
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000186 */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000187 protected abstract void updateList (List<CandidateSpan> candidateList);
188
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000189 /**
Nils Diewaldbb33da22015-03-04 16:24:25 +0000190 * Sets the candidate list for the first element in the target
191 * list and
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000192 * tells if the the specified spans has finished or not.
193 *
Nils Diewaldbb33da22015-03-04 16:24:25 +0000194 * @param candidateList
195 * a list of candidate spans
196 * @param candidate
197 * a Spans
198 * @param hasMoreCandidates
199 * a boolean
200 * @param targetList
201 * a list of target spans
202 * @return <code>true</code> if the span enumeration still has a
203 * next
204 * element to be a candidate, <code>false</code>
205 * otherwise.
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000206 * @throws IOException
207 */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000208 protected abstract boolean setCandidateList (
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000209 List<CandidateSpan> candidateList, Spans candidate,
210 boolean hasMoreCandidates, List<CandidateSpan> targetList)
211 throws IOException;
212
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000213 /**
Nils Diewaldbb33da22015-03-04 16:24:25 +0000214 * Finds all matches between the target span and its candidates in
215 * the
216 * candidate list.
217 *
218 * @param target
219 * a target span
220 * @param candidateList
221 * a candidate list
margarethac66265c2016-12-14 13:48:45 +0100222 * @param isTargetFirstSpan
223 * true is the target span is of the first span, false
224 * otherwise
Nils Diewaldbb33da22015-03-04 16:24:25 +0000225 * @return the matches in a list
226 */
227 protected abstract List<CandidateSpan> findMatches (CandidateSpan target,
margarethac66265c2016-12-14 13:48:45 +0100228 List<CandidateSpan> candidateList, boolean isTargetFirstSpan);
Nils Diewaldbb33da22015-03-04 16:24:25 +0000229
Nils Diewaldbb33da22015-03-04 16:24:25 +0000230 /**
231 * Computes match properties and creates a candidate span match to
margarethac66265c2016-12-14 13:48:45 +0100232 * be added to the match list.
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000233 *
234 * @return a candidate span match
Eliza Margaretha6f989202016-10-14 21:48:29 +0200235 */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000236 protected CandidateSpan createMatchCandidate (CandidateSpan target,
margarethabb5c26c2019-03-12 13:54:22 +0100237 CandidateSpan cs, boolean isDistanceZero,
238 boolean isTargetFirstSpan) {
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000239
240 int start = Math.min(target.getStart(), cs.getStart());
241 int end = Math.max(target.getEnd(), cs.getEnd());
242 int doc = target.getDoc();
243 long cost = target.getCost() + cs.getCost();
244
245 Collection<byte[]> payloads = new LinkedList<byte[]>();
246 if (collectPayloads) {
247 if (target.getPayloads() != null) {
248 payloads.addAll(target.getPayloads());
249 }
250 if (cs.getPayloads() != null) {
251 payloads.addAll(cs.getPayloads());
252 }
253 }
margarethabb5c26c2019-03-12 13:54:22 +0100254 CandidateSpan match =
255 new CandidateSpan(start, end, doc, cost, payloads);
margaretha35120872016-12-19 18:24:22 +0100256 if (isTargetFirstSpan) {
257 match.setChildSpan(target);
258 match.setSecondChildSpan(cs);
259 }
260 else {
261 match.setChildSpan(cs);
262 match.setSecondChildSpan(target);
263 }
margarethabb5c26c2019-03-12 13:54:22 +0100264 // match.setChildSpan(cs);
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000265 return match;
266 }
267
268 /**
Nils Diewaldbb33da22015-03-04 16:24:25 +0000269 * Assigns the first candidate span in the match list as the
margarethac66265c2016-12-14 13:48:45 +0100270 * current span match, and removes it from the matchList.
Eliza Margaretha6f989202016-10-14 21:48:29 +0200271 */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000272 private void setMatchProperties () {
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000273 CandidateSpan cs = matchList.get(0);
274 matchDocNumber = cs.getDoc();
275 matchStartPosition = cs.getStart();
276 matchEndPosition = cs.getEnd();
277 matchCost = cs.getCost();
278 matchPayload.addAll(cs.getPayloads());
279 matchList.remove(0);
280
margarethac66265c2016-12-14 13:48:45 +0100281 setMatchFirstSpan(cs.getChildSpan());
282 setMatchSecondSpan(cs.getSecondChildSpan());
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000283
284 // log.trace("Match doc#={} start={} end={}", matchDocNumber,
285 // matchStartPosition, matchEndPosition);
margarethabb5c26c2019-03-12 13:54:22 +0100286 // log.trace("firstspan " + getMatchFirstSpan().getStart() + "
287 // "
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000288 // + getMatchFirstSpan().getEnd());
margarethabb5c26c2019-03-12 13:54:22 +0100289 // log.trace("secondspan " + getMatchSecondSpan().getStart() +
290 // " "
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000291 // + getMatchSecondSpan().getEnd());
292 }
293
294 @Override
Nils Diewaldbb33da22015-03-04 16:24:25 +0000295 public boolean skipTo (int target) throws IOException {
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000296 if (hasMoreSpans && (secondSpans.doc() < target)) {
297 if (!secondSpans.skipTo(target)) {
298 hasMoreSpans = false;
299 return false;
300 }
301 }
302
303 firstSpanList.clear();
304 secondSpanList.clear();
305 matchPayload.clear();
306 isStartEnumeration = false;
307 return advance();
308 }
309
310 @Override
Nils Diewaldbb33da22015-03-04 16:24:25 +0000311 public long cost () {
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000312 return matchCost;
313 }
Eliza Margaretha1751dbf2014-02-03 09:47:19 +0000314
315}