blob: 0fec0af68499a404c258e2d0db98d892be16f3d4 [file] [log] [blame]
Eliza Margaretha1413e0f2014-02-06 13:01:29 +00001package de.ids_mannheim.korap.query.spans;
2
3import java.io.IOException;
4import java.util.ArrayList;
5import java.util.Iterator;
6import java.util.List;
7import java.util.Map;
8
Akron700c1eb2015-09-25 16:57:30 +02009import org.apache.lucene.index.LeafReaderContext;
Eliza Margaretha1413e0f2014-02-06 13:01:29 +000010import org.apache.lucene.index.Term;
11import org.apache.lucene.index.TermContext;
12import org.apache.lucene.search.spans.Spans;
13import org.apache.lucene.util.Bits;
14
15import de.ids_mannheim.korap.query.SpanDistanceQuery;
16
Eliza Margaretha609a5be2014-12-18 16:52:20 +000017/**
Nils Diewaldbb33da22015-03-04 16:24:25 +000018 * Enumeration of span matches, whose two child spans have a specific
19 * range of
20 * distance (within a min and a max distance) and can be in any order.
21 * The unit
22 * distance is an element, which can be a sentence or a paragraph for
23 * instance.
24 * The distance is the difference between the positions of elements
25 * containing
Eliza Margaretha609a5be2014-12-18 16:52:20 +000026 * the spans.
Eliza Margaretha1413e0f2014-02-06 13:01:29 +000027 *
28 * @author margaretha
29 * */
Eliza Margaretha609a5be2014-12-18 16:52:20 +000030public class UnorderedElementDistanceSpans extends UnorderedDistanceSpans {
Eliza Margaretha1413e0f2014-02-06 13:01:29 +000031
Eliza Margaretha609a5be2014-12-18 16:52:20 +000032 private Spans elements;
33 private boolean hasMoreElements;
34 private int elementPosition;
Eliza Margaretha1413e0f2014-02-06 13:01:29 +000035
Eliza Margaretha609a5be2014-12-18 16:52:20 +000036 // contains all previous elements whose position is greater than the last
37 // target span
38 private List<CandidateSpan> elementList;
39
Nils Diewaldbb33da22015-03-04 16:24:25 +000040
Eliza Margaretha609a5be2014-12-18 16:52:20 +000041 /**
Eliza Margaretha7612bde2015-01-14 10:28:42 +000042 * Constructs UnorderedElementDistanceSpans for the given
Eliza Margaretha609a5be2014-12-18 16:52:20 +000043 * {@link SpanDistanceQuery}.
44 *
Nils Diewaldbb33da22015-03-04 16:24:25 +000045 * @param query
46 * a SpanDistanceQuery
Eliza Margaretha609a5be2014-12-18 16:52:20 +000047 * @param context
48 * @param acceptDocs
49 * @param termContexts
50 * @throws IOException
51 */
Nils Diewaldbb33da22015-03-04 16:24:25 +000052 public UnorderedElementDistanceSpans (SpanDistanceQuery query,
Akron700c1eb2015-09-25 16:57:30 +020053 LeafReaderContext context,
Nils Diewaldbb33da22015-03-04 16:24:25 +000054 Bits acceptDocs,
55 Map<Term, TermContext> termContexts)
56 throws IOException {
Eliza Margaretha609a5be2014-12-18 16:52:20 +000057 super(query, context, acceptDocs, termContexts);
58 elements = query.getElementQuery().getSpans(context, acceptDocs,
59 termContexts);
60 hasMoreElements = elements.next();
61 elementPosition = 0;
62 elementList = new ArrayList<CandidateSpan>();
63 }
64
Nils Diewaldbb33da22015-03-04 16:24:25 +000065
Eliza Margaretha609a5be2014-12-18 16:52:20 +000066 @Override
Nils Diewaldbb33da22015-03-04 16:24:25 +000067 protected boolean prepareLists () throws IOException {
Eliza Margaretha609a5be2014-12-18 16:52:20 +000068
69 if (firstSpanList.isEmpty() && secondSpanList.isEmpty()) {
70 if (hasMoreFirstSpans && hasMoreSecondSpans && hasMoreElements
71 && findSameDoc(firstSpans, secondSpans, elements)) {
72
73 if (currentDocNum != firstSpans.doc()) {
74 currentDocNum = firstSpans.doc();
75 elementList.clear();
76 }
77
78 hasMoreFirstSpans = addSpan(firstSpans, firstSpanList,
79 hasMoreFirstSpans);
80 hasMoreSecondSpans = addSpan(secondSpans, secondSpanList,
81 hasMoreSecondSpans);
Nils Diewaldbb33da22015-03-04 16:24:25 +000082 }
83 else {
Eliza Margaretha609a5be2014-12-18 16:52:20 +000084 hasMoreSpans = false;
85 return false;
86 }
Nils Diewaldbb33da22015-03-04 16:24:25 +000087 }
88 else if (firstSpanList.isEmpty() && hasMoreFirstSpans
Eliza Margaretha609a5be2014-12-18 16:52:20 +000089 && firstSpans.doc() == currentDocNum) {
90 hasMoreFirstSpans = addSpan(firstSpans, firstSpanList,
91 hasMoreFirstSpans);
Nils Diewaldbb33da22015-03-04 16:24:25 +000092 }
93 else if (secondSpanList.isEmpty() && hasMoreSecondSpans
Eliza Margaretha609a5be2014-12-18 16:52:20 +000094 && secondSpans.doc() == currentDocNum) {
95 hasMoreSecondSpans = addSpan(secondSpans, secondSpanList,
96 hasMoreSecondSpans);
97 }
98
99 return true;
100 }
101
Nils Diewaldbb33da22015-03-04 16:24:25 +0000102
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000103 /**
Nils Diewaldbb33da22015-03-04 16:24:25 +0000104 * Adds all the spans occurring in the current document, as
105 * CandidateSpans
106 * to the specified candidate list, and tells if the enumeration
107 * of the
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000108 * spans has finished, or not.
109 *
Nils Diewaldbb33da22015-03-04 16:24:25 +0000110 * @param span
111 * a Span
112 * @param list
113 * a candidateList
114 * @param hasMoreSpan
115 * a boolean describing if the span enumeration has
116 * finished or not.
117 * @return <code>true</code> if the the span enumeration has
118 * finished,
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000119 * <code>false</code> otherwise.
120 * @throws IOException
121 */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000122 private boolean addSpan (Spans span, List<CandidateSpan> list,
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000123 boolean hasMoreSpan) throws IOException {
124 int position;
125 while (hasMoreSpan && span.doc() == currentDocNum) {
126 position = findElementPosition(span);
127 if (position != -1) {
128 list.add(new CandidateSpan(span, position));
129 hasMoreSpan = span.next();
130 return hasMoreSpan;
131 }
132 hasMoreSpan = span.next();
133 }
134 return hasMoreSpan;
135 }
136
Nils Diewaldbb33da22015-03-04 16:24:25 +0000137
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000138 /**
Nils Diewaldbb33da22015-03-04 16:24:25 +0000139 * Finds the element position of the specified span in the element
140 * list or
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000141 * by advancing the element spans until encountering the span.
142 *
Nils Diewaldbb33da22015-03-04 16:24:25 +0000143 * @param span
144 * a Span
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000145 * @return the element position
146 * @throws IOException
147 */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000148 private int findElementPosition (Spans span) throws IOException {
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000149 // Check in the element list
150 if (!elementList.isEmpty()
151 && span.end() <= elementList.get(elementList.size() - 1)
152 .getEnd()) {
153
154 for (CandidateSpan e : elementList)
155 if (e.getEnd() >= span.end() && e.getStart() <= span.start()) {
156 return e.getPosition();
157 }
158 return -1; // The span is not in an element.
159 }
160
161 return (advanceElementTo(span) ? elementPosition : -1);
162 }
163
Nils Diewaldbb33da22015-03-04 16:24:25 +0000164
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000165 /**
166 * Advances the element spans until encountering the given span.
167 *
168 * @param span
Nils Diewaldbb33da22015-03-04 16:24:25 +0000169 * @return <code>true</code> if such an element is found,
170 * <code>false</code>
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000171 * if the span is not in an element.
172 * @throws IOException
173 */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000174 private boolean advanceElementTo (Spans span) throws IOException {
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000175 while (hasMoreElements && elements.doc() == currentDocNum
176 && elements.start() < span.end()) {
177
178 if (span.start() >= elements.start()
179 && span.end() <= elements.end()) {
180 return true;
181 }
182 elementList.add(new CandidateSpan(elements, elementPosition));
183 hasMoreElements = elements.next();
184 elementPosition++;
185 }
186
187 return false; // invalid
188 }
189
Nils Diewaldbb33da22015-03-04 16:24:25 +0000190
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000191 @Override
Nils Diewaldbb33da22015-03-04 16:24:25 +0000192 protected boolean setCandidateList (List<CandidateSpan> candidateList,
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000193 Spans candidate, boolean hasMoreCandidates,
194 List<CandidateSpan> targetList) throws IOException {
195
196 if (!targetList.isEmpty()) {
197 CandidateSpan cs;
198 CandidateSpan target = targetList.get(0);
199 int position;
200 while (hasMoreCandidates && candidate.doc() == target.getDoc()) {
201 position = findElementPosition(candidate);
202 if (position != -1) {
203 cs = new CandidateSpan(candidate, position);
204
205 if (isWithinMaxDistance(target, cs)) {
206 candidateList.add(cs);
Nils Diewaldbb33da22015-03-04 16:24:25 +0000207 }
208 else
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000209 break;
210 }
211 hasMoreCandidates = candidate.next();
212 }
213 }
214 return hasMoreCandidates;
215 }
216
Nils Diewaldbb33da22015-03-04 16:24:25 +0000217
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000218 /**
Nils Diewaldbb33da22015-03-04 16:24:25 +0000219 * Tells if the target and candidate spans are not too far from
220 * each other
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000221 * (within the maximum distance).
222 *
Nils Diewaldbb33da22015-03-04 16:24:25 +0000223 * @return <code>true</code> if the target and candidate spans are
224 * within
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000225 * the maximum distance, <code>false</code> otherwise.
226 * */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000227 protected boolean isWithinMaxDistance (CandidateSpan target,
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000228 CandidateSpan candidate) {
229 int candidatePos = candidate.getPosition();
230 int targetPos = target.getPosition();
231
232 // left candidate
233 if (candidatePos < targetPos && candidatePos + maxDistance < targetPos) {
234 return false;
235 }
236 // right candidate
237 if (candidatePos > targetPos && targetPos + maxDistance < candidatePos) {
238 return false;
239 }
240 return true;
241 }
242
Nils Diewaldbb33da22015-03-04 16:24:25 +0000243
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000244 @Override
Nils Diewaldbb33da22015-03-04 16:24:25 +0000245 protected List<CandidateSpan> findMatches (CandidateSpan target,
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000246 List<CandidateSpan> candidateList) {
247
248 List<CandidateSpan> matches = new ArrayList<>();
249
250 int actualDistance;
251 int targetPos = target.getPosition();
252
253 for (CandidateSpan cs : candidateList) {
254 actualDistance = Math.abs(targetPos - cs.getPosition());
255
256 if (minDistance == 0 && actualDistance == 0) {
257 matches.add(createMatchCandidate(target, cs, true));
258 continue;
259 }
260
261 if (minDistance <= actualDistance && actualDistance <= maxDistance)
262 matches.add(createMatchCandidate(target, cs, false));
263 }
264 return matches;
265 }
266
Nils Diewaldbb33da22015-03-04 16:24:25 +0000267
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000268 @Override
Nils Diewaldbb33da22015-03-04 16:24:25 +0000269 protected void updateList (List<CandidateSpan> candidateList) {
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000270 updateElementList(candidateList.get(0).getPosition());
271 candidateList.remove(0);
272 }
273
Nils Diewaldbb33da22015-03-04 16:24:25 +0000274
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000275 /**
Nils Diewaldbb33da22015-03-04 16:24:25 +0000276 * Reduces the number of elements kept in the element list by
277 * removing the
278 * elements whose position is smaller than or identical to the
279 * position of
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000280 * the last target span.
281 *
Nils Diewaldbb33da22015-03-04 16:24:25 +0000282 * @param position
283 * the last target span position
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000284 */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000285 private void updateElementList (int position) {
Eliza Margaretha609a5be2014-12-18 16:52:20 +0000286 Iterator<CandidateSpan> i = elementList.iterator();
287 CandidateSpan e;
288 while (i.hasNext()) {
289 e = i.next();
290 if (e.getPosition() <= position) {
291 i.remove();
292 }
293 break;
294 }
295 }
Eliza Margaretha1413e0f2014-02-06 13:01:29 +0000296}