| Eliza Margaretha | 1413e0f | 2014-02-06 13:01:29 +0000 | [diff] [blame] | 1 | package de.ids_mannheim.korap.query.spans; |
| 2 | |
| 3 | import java.io.IOException; |
| 4 | import java.util.ArrayList; |
| Eliza Margaretha | 1413e0f | 2014-02-06 13:01:29 +0000 | [diff] [blame] | 5 | import java.util.List; |
| 6 | import java.util.Map; |
| 7 | |
| Akron | 700c1eb | 2015-09-25 16:57:30 +0200 | [diff] [blame] | 8 | import org.apache.lucene.index.LeafReaderContext; |
| Eliza Margaretha | 1413e0f | 2014-02-06 13:01:29 +0000 | [diff] [blame] | 9 | import org.apache.lucene.index.Term; |
| 10 | import org.apache.lucene.index.TermContext; |
| 11 | import org.apache.lucene.search.spans.Spans; |
| 12 | import org.apache.lucene.util.Bits; |
| 13 | |
| 14 | import de.ids_mannheim.korap.query.SpanDistanceQuery; |
| 15 | |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 16 | /** |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 17 | * Enumeration of span matches, whose two child spans have a specific |
| 18 | * range of |
| 19 | * distance (within a min and a max distance) and can be in any order. |
| 20 | * The unit |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 21 | * distance is a token position. |
| Eliza Margaretha | 1413e0f | 2014-02-06 13:01:29 +0000 | [diff] [blame] | 22 | * |
| 23 | * @author margaretha |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 24 | */ |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 25 | public class UnorderedTokenDistanceSpans extends UnorderedDistanceSpans { |
| Eliza Margaretha | 1413e0f | 2014-02-06 13:01:29 +0000 | [diff] [blame] | 26 | |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 27 | /** |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 28 | * Constructs UnorderedTokenDistanceSpans for the given |
| 29 | * SpanDistanceQuery. |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 30 | * |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 31 | * @param query |
| 32 | * a SpanDistanceQuery |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 33 | * @param context |
| 34 | * @param acceptDocs |
| 35 | * @param termContexts |
| 36 | * @throws IOException |
| 37 | */ |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 38 | public UnorderedTokenDistanceSpans (SpanDistanceQuery query, |
| Akron | 700c1eb | 2015-09-25 16:57:30 +0200 | [diff] [blame] | 39 | LeafReaderContext context, |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 40 | Bits acceptDocs, |
| 41 | Map<Term, TermContext> termContexts) |
| 42 | throws IOException { |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 43 | super(query, context, acceptDocs, termContexts); |
| 44 | } |
| Eliza Margaretha | 1413e0f | 2014-02-06 13:01:29 +0000 | [diff] [blame] | 45 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 46 | |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 47 | @Override |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 48 | protected boolean prepareLists () throws IOException { |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 49 | |
| 50 | if (firstSpanList.isEmpty() && secondSpanList.isEmpty()) { |
| 51 | if (hasMoreFirstSpans && hasMoreSecondSpans |
| 52 | && ensureSameDoc(firstSpans, secondSpans)) { |
| 53 | firstSpanList.add(new CandidateSpan(firstSpans)); |
| 54 | secondSpanList.add(new CandidateSpan(secondSpans)); |
| 55 | currentDocNum = firstSpans.doc(); |
| 56 | hasMoreFirstSpans = firstSpans.next(); |
| 57 | hasMoreSecondSpans = secondSpans.next(); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 58 | } |
| 59 | else { |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 60 | hasMoreSpans = false; |
| 61 | return false; |
| 62 | } |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 63 | } |
| 64 | else if (firstSpanList.isEmpty() && hasMoreFirstSpans |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 65 | && firstSpans.doc() == currentDocNum) { |
| 66 | firstSpanList.add(new CandidateSpan(firstSpans)); |
| 67 | hasMoreFirstSpans = firstSpans.next(); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 68 | } |
| 69 | else if (secondSpanList.isEmpty() && hasMoreSecondSpans |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 70 | && secondSpans.doc() == currentDocNum) { |
| 71 | secondSpanList.add(new CandidateSpan(secondSpans)); |
| 72 | hasMoreSecondSpans = secondSpans.next(); |
| 73 | } |
| 74 | return true; |
| 75 | } |
| 76 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 77 | |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 78 | @Override |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 79 | protected boolean setCandidateList (List<CandidateSpan> candidateList, |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 80 | Spans candidate, boolean hasMoreCandidates, |
| 81 | List<CandidateSpan> targetList) throws IOException { |
| 82 | |
| 83 | if (!targetList.isEmpty()) { |
| 84 | CandidateSpan target = targetList.get(0); |
| 85 | while (hasMoreCandidates && candidate.doc() == target.getDoc() |
| 86 | && isWithinMaxDistance(target, candidate)) { |
| 87 | candidateList.add(new CandidateSpan(candidate)); |
| 88 | hasMoreCandidates = candidate.next(); |
| 89 | } |
| 90 | } |
| 91 | return hasMoreCandidates; |
| 92 | } |
| 93 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 94 | |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 95 | /** |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 96 | * Tells if the target and candidate spans are not too far from |
| 97 | * each other |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 98 | * (within the maximum distance). |
| 99 | * |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 100 | * @param target |
| 101 | * a target span |
| 102 | * @param candidate |
| 103 | * a candidate span |
| 104 | * @return <code>true</code> if the target and candidate spans are |
| 105 | * within |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 106 | * the maximum distance, <code>false</code> otherwise. |
| 107 | */ |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 108 | protected boolean isWithinMaxDistance (CandidateSpan target, |
| 109 | Spans candidate) { |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 110 | // left candidate |
| 111 | if (candidate.end() < target.getStart() |
| 112 | && candidate.end() + maxDistance <= target.getStart()) { |
| 113 | return false; |
| 114 | } |
| 115 | // right candidate |
| 116 | if (candidate.start() > target.getEnd() |
| 117 | && target.getEnd() + maxDistance <= candidate.start()) { |
| 118 | return false; |
| 119 | } |
| 120 | return true; |
| 121 | } |
| 122 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 123 | |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 124 | @Override |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 125 | protected List<CandidateSpan> findMatches (CandidateSpan target, |
| margaretha | c66265c | 2016-12-14 13:48:45 +0100 | [diff] [blame] | 126 | List<CandidateSpan> candidateList, boolean isTargetFirstSpan) { |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 127 | |
| 128 | List<CandidateSpan> matches = new ArrayList<>(); |
| 129 | int actualDistance; |
| margaretha | 3512087 | 2016-12-19 18:24:22 +0100 | [diff] [blame] | 130 | CandidateSpan match; |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 131 | for (CandidateSpan cs : candidateList) { |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 132 | if (minDistance == 0 && |
| 133 | // intersection |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 134 | target.getStart() < cs.getEnd() |
| 135 | && cs.getStart() < target.getEnd()) { |
| margaretha | 3512087 | 2016-12-19 18:24:22 +0100 | [diff] [blame] | 136 | match = createMatchCandidate(target, cs, true, isTargetFirstSpan); |
| 137 | matches.add(match); |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 138 | continue; |
| 139 | } |
| 140 | |
| 141 | // left candidate |
| margaretha | 50110f3 | 2015-05-12 18:21:29 +0200 | [diff] [blame] | 142 | if (cs.getEnd() < target.getStart()) { |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 143 | actualDistance = target.getStart() - cs.getEnd() + 1; |
| margaretha | 50110f3 | 2015-05-12 18:21:29 +0200 | [diff] [blame] | 144 | } |
| 145 | else { |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 146 | // right candidate |
| 147 | actualDistance = cs.getStart() - target.getEnd() + 1; |
| margaretha | 50110f3 | 2015-05-12 18:21:29 +0200 | [diff] [blame] | 148 | } |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 149 | if (minDistance <= actualDistance |
| 150 | && actualDistance <= maxDistance) { |
| margaretha | 3512087 | 2016-12-19 18:24:22 +0100 | [diff] [blame] | 151 | match = createMatchCandidate(target, cs, false, isTargetFirstSpan); |
| 152 | matches.add(match); |
| margaretha | 50110f3 | 2015-05-12 18:21:29 +0200 | [diff] [blame] | 153 | } |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 154 | } |
| 155 | return matches; |
| 156 | } |
| 157 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 158 | |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 159 | @Override |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 160 | protected void updateList (List<CandidateSpan> candidateList) { |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 161 | candidateList.remove(0); |
| 162 | } |
| Eliza Margaretha | 1413e0f | 2014-02-06 13:01:29 +0000 | [diff] [blame] | 163 | |
| 164 | } |