| Eliza Margaretha | 1413e0f | 2014-02-06 13:01:29 +0000 | [diff] [blame] | 1 | package de.ids_mannheim.korap.query.spans; |
| 2 | |
| 3 | import java.io.IOException; |
| 4 | import java.util.ArrayList; |
| 5 | import java.util.Iterator; |
| 6 | import java.util.List; |
| 7 | import java.util.Map; |
| 8 | |
| Akron | 700c1eb | 2015-09-25 16:57:30 +0200 | [diff] [blame] | 9 | import org.apache.lucene.index.LeafReaderContext; |
| Eliza Margaretha | 1413e0f | 2014-02-06 13:01:29 +0000 | [diff] [blame] | 10 | import org.apache.lucene.index.Term; |
| 11 | import org.apache.lucene.index.TermContext; |
| 12 | import org.apache.lucene.search.spans.Spans; |
| 13 | import org.apache.lucene.util.Bits; |
| 14 | |
| 15 | import de.ids_mannheim.korap.query.SpanDistanceQuery; |
| 16 | |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 17 | /** |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 18 | * Enumeration of span matches, whose two child spans have a specific |
| 19 | * range of |
| 20 | * distance (within a min and a max distance) and can be in any order. |
| 21 | * The unit |
| 22 | * distance is an element, which can be a sentence or a paragraph for |
| 23 | * instance. |
| 24 | * The distance is the difference between the positions of elements |
| 25 | * containing |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 26 | * the spans. |
| Eliza Margaretha | 1413e0f | 2014-02-06 13:01:29 +0000 | [diff] [blame] | 27 | * |
| 28 | * @author margaretha |
| 29 | * */ |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 30 | public class UnorderedElementDistanceSpans extends UnorderedDistanceSpans { |
| Eliza Margaretha | 1413e0f | 2014-02-06 13:01:29 +0000 | [diff] [blame] | 31 | |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 32 | private Spans elements; |
| 33 | private boolean hasMoreElements; |
| 34 | private int elementPosition; |
| Eliza Margaretha | 1413e0f | 2014-02-06 13:01:29 +0000 | [diff] [blame] | 35 | |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 36 | // contains all previous elements whose position is greater than the last |
| 37 | // target span |
| 38 | private List<CandidateSpan> elementList; |
| 39 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 40 | |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 41 | /** |
| Eliza Margaretha | 7612bde | 2015-01-14 10:28:42 +0000 | [diff] [blame] | 42 | * Constructs UnorderedElementDistanceSpans for the given |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 43 | * {@link SpanDistanceQuery}. |
| 44 | * |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 45 | * @param query |
| 46 | * a SpanDistanceQuery |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 47 | * @param context |
| 48 | * @param acceptDocs |
| 49 | * @param termContexts |
| 50 | * @throws IOException |
| 51 | */ |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 52 | public UnorderedElementDistanceSpans (SpanDistanceQuery query, |
| Akron | 700c1eb | 2015-09-25 16:57:30 +0200 | [diff] [blame] | 53 | LeafReaderContext context, |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 54 | Bits acceptDocs, |
| 55 | Map<Term, TermContext> termContexts) |
| 56 | throws IOException { |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 57 | super(query, context, acceptDocs, termContexts); |
| 58 | elements = query.getElementQuery().getSpans(context, acceptDocs, |
| 59 | termContexts); |
| 60 | hasMoreElements = elements.next(); |
| 61 | elementPosition = 0; |
| 62 | elementList = new ArrayList<CandidateSpan>(); |
| 63 | } |
| 64 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 65 | |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 66 | @Override |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 67 | protected boolean prepareLists () throws IOException { |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 68 | |
| 69 | if (firstSpanList.isEmpty() && secondSpanList.isEmpty()) { |
| 70 | if (hasMoreFirstSpans && hasMoreSecondSpans && hasMoreElements |
| 71 | && findSameDoc(firstSpans, secondSpans, elements)) { |
| 72 | |
| 73 | if (currentDocNum != firstSpans.doc()) { |
| 74 | currentDocNum = firstSpans.doc(); |
| 75 | elementList.clear(); |
| 76 | } |
| 77 | |
| 78 | hasMoreFirstSpans = addSpan(firstSpans, firstSpanList, |
| 79 | hasMoreFirstSpans); |
| 80 | hasMoreSecondSpans = addSpan(secondSpans, secondSpanList, |
| 81 | hasMoreSecondSpans); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 82 | } |
| 83 | else { |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 84 | hasMoreSpans = false; |
| 85 | return false; |
| 86 | } |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 87 | } |
| 88 | else if (firstSpanList.isEmpty() && hasMoreFirstSpans |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 89 | && firstSpans.doc() == currentDocNum) { |
| 90 | hasMoreFirstSpans = addSpan(firstSpans, firstSpanList, |
| 91 | hasMoreFirstSpans); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 92 | } |
| 93 | else if (secondSpanList.isEmpty() && hasMoreSecondSpans |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 94 | && secondSpans.doc() == currentDocNum) { |
| 95 | hasMoreSecondSpans = addSpan(secondSpans, secondSpanList, |
| 96 | hasMoreSecondSpans); |
| 97 | } |
| 98 | |
| 99 | return true; |
| 100 | } |
| 101 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 102 | |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 103 | /** |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 104 | * Adds all the spans occurring in the current document, as |
| 105 | * CandidateSpans |
| 106 | * to the specified candidate list, and tells if the enumeration |
| 107 | * of the |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 108 | * spans has finished, or not. |
| 109 | * |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 110 | * @param span |
| 111 | * a Span |
| 112 | * @param list |
| 113 | * a candidateList |
| 114 | * @param hasMoreSpan |
| 115 | * a boolean describing if the span enumeration has |
| 116 | * finished or not. |
| 117 | * @return <code>true</code> if the the span enumeration has |
| 118 | * finished, |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 119 | * <code>false</code> otherwise. |
| 120 | * @throws IOException |
| 121 | */ |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 122 | private boolean addSpan (Spans span, List<CandidateSpan> list, |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 123 | boolean hasMoreSpan) throws IOException { |
| 124 | int position; |
| 125 | while (hasMoreSpan && span.doc() == currentDocNum) { |
| 126 | position = findElementPosition(span); |
| 127 | if (position != -1) { |
| 128 | list.add(new CandidateSpan(span, position)); |
| 129 | hasMoreSpan = span.next(); |
| 130 | return hasMoreSpan; |
| 131 | } |
| 132 | hasMoreSpan = span.next(); |
| 133 | } |
| 134 | return hasMoreSpan; |
| 135 | } |
| 136 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 137 | |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 138 | /** |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 139 | * Finds the element position of the specified span in the element |
| 140 | * list or |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 141 | * by advancing the element spans until encountering the span. |
| 142 | * |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 143 | * @param span |
| 144 | * a Span |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 145 | * @return the element position |
| 146 | * @throws IOException |
| 147 | */ |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 148 | private int findElementPosition (Spans span) throws IOException { |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 149 | // Check in the element list |
| 150 | if (!elementList.isEmpty() |
| 151 | && span.end() <= elementList.get(elementList.size() - 1) |
| 152 | .getEnd()) { |
| 153 | |
| 154 | for (CandidateSpan e : elementList) |
| 155 | if (e.getEnd() >= span.end() && e.getStart() <= span.start()) { |
| 156 | return e.getPosition(); |
| 157 | } |
| 158 | return -1; // The span is not in an element. |
| 159 | } |
| 160 | |
| 161 | return (advanceElementTo(span) ? elementPosition : -1); |
| 162 | } |
| 163 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 164 | |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 165 | /** |
| 166 | * Advances the element spans until encountering the given span. |
| 167 | * |
| 168 | * @param span |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 169 | * @return <code>true</code> if such an element is found, |
| 170 | * <code>false</code> |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 171 | * if the span is not in an element. |
| 172 | * @throws IOException |
| 173 | */ |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 174 | private boolean advanceElementTo (Spans span) throws IOException { |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 175 | while (hasMoreElements && elements.doc() == currentDocNum |
| 176 | && elements.start() < span.end()) { |
| 177 | |
| 178 | if (span.start() >= elements.start() |
| 179 | && span.end() <= elements.end()) { |
| 180 | return true; |
| 181 | } |
| 182 | elementList.add(new CandidateSpan(elements, elementPosition)); |
| 183 | hasMoreElements = elements.next(); |
| 184 | elementPosition++; |
| 185 | } |
| 186 | |
| 187 | return false; // invalid |
| 188 | } |
| 189 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 190 | |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 191 | @Override |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 192 | protected boolean setCandidateList (List<CandidateSpan> candidateList, |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 193 | Spans candidate, boolean hasMoreCandidates, |
| 194 | List<CandidateSpan> targetList) throws IOException { |
| 195 | |
| 196 | if (!targetList.isEmpty()) { |
| 197 | CandidateSpan cs; |
| 198 | CandidateSpan target = targetList.get(0); |
| 199 | int position; |
| 200 | while (hasMoreCandidates && candidate.doc() == target.getDoc()) { |
| 201 | position = findElementPosition(candidate); |
| 202 | if (position != -1) { |
| 203 | cs = new CandidateSpan(candidate, position); |
| 204 | |
| 205 | if (isWithinMaxDistance(target, cs)) { |
| 206 | candidateList.add(cs); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 207 | } |
| 208 | else |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 209 | break; |
| 210 | } |
| 211 | hasMoreCandidates = candidate.next(); |
| 212 | } |
| 213 | } |
| 214 | return hasMoreCandidates; |
| 215 | } |
| 216 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 217 | |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 218 | /** |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 219 | * Tells if the target and candidate spans are not too far from |
| 220 | * each other |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 221 | * (within the maximum distance). |
| 222 | * |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 223 | * @return <code>true</code> if the target and candidate spans are |
| 224 | * within |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 225 | * the maximum distance, <code>false</code> otherwise. |
| 226 | * */ |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 227 | protected boolean isWithinMaxDistance (CandidateSpan target, |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 228 | CandidateSpan candidate) { |
| 229 | int candidatePos = candidate.getPosition(); |
| 230 | int targetPos = target.getPosition(); |
| 231 | |
| 232 | // left candidate |
| 233 | if (candidatePos < targetPos && candidatePos + maxDistance < targetPos) { |
| 234 | return false; |
| 235 | } |
| 236 | // right candidate |
| 237 | if (candidatePos > targetPos && targetPos + maxDistance < candidatePos) { |
| 238 | return false; |
| 239 | } |
| 240 | return true; |
| 241 | } |
| 242 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 243 | |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 244 | @Override |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 245 | protected List<CandidateSpan> findMatches (CandidateSpan target, |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 246 | List<CandidateSpan> candidateList) { |
| 247 | |
| 248 | List<CandidateSpan> matches = new ArrayList<>(); |
| 249 | |
| 250 | int actualDistance; |
| 251 | int targetPos = target.getPosition(); |
| 252 | |
| 253 | for (CandidateSpan cs : candidateList) { |
| 254 | actualDistance = Math.abs(targetPos - cs.getPosition()); |
| 255 | |
| 256 | if (minDistance == 0 && actualDistance == 0) { |
| 257 | matches.add(createMatchCandidate(target, cs, true)); |
| 258 | continue; |
| 259 | } |
| 260 | |
| 261 | if (minDistance <= actualDistance && actualDistance <= maxDistance) |
| 262 | matches.add(createMatchCandidate(target, cs, false)); |
| 263 | } |
| 264 | return matches; |
| 265 | } |
| 266 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 267 | |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 268 | @Override |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 269 | protected void updateList (List<CandidateSpan> candidateList) { |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 270 | updateElementList(candidateList.get(0).getPosition()); |
| 271 | candidateList.remove(0); |
| 272 | } |
| 273 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 274 | |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 275 | /** |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 276 | * Reduces the number of elements kept in the element list by |
| 277 | * removing the |
| 278 | * elements whose position is smaller than or identical to the |
| 279 | * position of |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 280 | * the last target span. |
| 281 | * |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 282 | * @param position |
| 283 | * the last target span position |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 284 | */ |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 285 | private void updateElementList (int position) { |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 286 | Iterator<CandidateSpan> i = elementList.iterator(); |
| 287 | CandidateSpan e; |
| 288 | while (i.hasNext()) { |
| 289 | e = i.next(); |
| 290 | if (e.getPosition() <= position) { |
| 291 | i.remove(); |
| 292 | } |
| 293 | break; |
| 294 | } |
| 295 | } |
| Eliza Margaretha | 1413e0f | 2014-02-06 13:01:29 +0000 | [diff] [blame] | 296 | } |