| Eliza Margaretha | 198e4ef | 2014-02-10 13:50:50 +0000 | [diff] [blame] | 1 | package de.ids_mannheim.korap.query.spans; |
| 2 | |
| 3 | import java.io.IOException; |
| 4 | import java.util.ArrayList; |
| 5 | import java.util.List; |
| 6 | import java.util.Map; |
| 7 | |
| Akron | 700c1eb | 2015-09-25 16:57:30 +0200 | [diff] [blame] | 8 | import org.apache.lucene.index.LeafReaderContext; |
| Eliza Margaretha | 198e4ef | 2014-02-10 13:50:50 +0000 | [diff] [blame] | 9 | import org.apache.lucene.index.Term; |
| 10 | import org.apache.lucene.index.TermContext; |
| 11 | import org.apache.lucene.util.Bits; |
| 12 | |
| 13 | import de.ids_mannheim.korap.query.SpanDistanceQuery; |
| 14 | |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 15 | /** |
| 16 | * Base class for calculating a distance between two ordered spans. |
| 17 | * |
| 18 | * @author margaretha |
| Eliza Margaretha | 198e4ef | 2014-02-10 13:50:50 +0000 | [diff] [blame] | 19 | * */ |
| 20 | public abstract class OrderedDistanceSpans extends DistanceSpans { |
| 21 | |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 22 | protected boolean hasMoreFirstSpans; |
| 23 | protected int minDistance, maxDistance; |
| Nils Diewald | 34eaa86 | 2014-06-03 10:56:27 +0000 | [diff] [blame] | 24 | |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 25 | protected List<CandidateSpan> candidateList; |
| 26 | protected int candidateListIndex; |
| 27 | protected int candidateListDocNum; |
| Nils Diewald | 34eaa86 | 2014-06-03 10:56:27 +0000 | [diff] [blame] | 28 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 29 | |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 30 | /** |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 31 | * Constructs OrderedDistanceSpans based on the given |
| 32 | * SpanDistanceQuery. |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 33 | * |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 34 | * @param query |
| 35 | * a SpanDistanceQuery |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 36 | * @param context |
| 37 | * @param acceptDocs |
| 38 | * @param termContexts |
| 39 | * @throws IOException |
| 40 | */ |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 41 | public OrderedDistanceSpans (SpanDistanceQuery query, |
| Akron | 700c1eb | 2015-09-25 16:57:30 +0200 | [diff] [blame] | 42 | LeafReaderContext context, Bits acceptDocs, |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 43 | Map<Term, TermContext> termContexts) |
| 44 | throws IOException { |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 45 | super(query, context, acceptDocs, termContexts); |
| Eliza Margaretha | 198e4ef | 2014-02-10 13:50:50 +0000 | [diff] [blame] | 46 | |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 47 | minDistance = query.getMinDistance(); |
| 48 | maxDistance = query.getMaxDistance(); |
| 49 | |
| 50 | hasMoreFirstSpans = firstSpans.next(); |
| 51 | |
| 52 | candidateList = new ArrayList<>(); |
| 53 | candidateListIndex = -1; |
| 54 | candidateListDocNum = firstSpans.doc(); |
| 55 | } |
| 56 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 57 | |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 58 | /** |
| 59 | * Finds a span match in the candidate list. |
| 60 | * */ |
| 61 | @Override |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 62 | protected boolean advance () throws IOException { |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 63 | while (hasMoreSpans && candidateListIndex < candidateList.size()) { |
| 64 | // Check candidates |
| 65 | for (candidateListIndex++; candidateListIndex < candidateList |
| 66 | .size(); candidateListIndex++) { |
| 67 | if (findMatch()) |
| 68 | return true; |
| 69 | } |
| 70 | |
| 71 | do { // Forward secondspan |
| 72 | hasMoreSpans = secondSpans.next(); |
| 73 | setCandidateList(); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 74 | } |
| 75 | while (hasMoreSpans && !isSecondSpanValid()); |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 76 | } |
| 77 | return false; |
| 78 | } |
| 79 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 80 | |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 81 | /** |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 82 | * Determines if the current second span is valid (i.e. within an |
| 83 | * element). |
| 84 | * It is always valid in TokenDistanceSpan, but it can be invalid |
| 85 | * in the |
| 86 | * ElementDistanceSpan, namely when it is not within a particular |
| 87 | * element (a |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 88 | * sentence or a paragraph depends on the element distance unit). |
| 89 | * |
| 90 | * @return <code>true</code> of the current second span is valid, |
| 91 | * <code>false</code> otherwise. |
| 92 | * @throws IOException |
| 93 | */ |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 94 | protected abstract boolean isSecondSpanValid () throws IOException; |
| 95 | |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 96 | |
| 97 | /** |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 98 | * Stores/collects the states of all possible firstspans as |
| 99 | * candidate spans |
| 100 | * for the current secondspan. The candidate spans must be within |
| 101 | * the |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 102 | * maximum distance from the current secondspan. |
| 103 | * |
| 104 | * @throws IOException |
| 105 | */ |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 106 | protected abstract void setCandidateList () throws IOException; |
| 107 | |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 108 | |
| 109 | /** |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 110 | * Defines the conditions for a match and tells if a match is |
| 111 | * found. |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 112 | * |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 113 | * @return <code>true</code> if a match is found, |
| 114 | * <code>false</code> |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 115 | * otherwise. |
| 116 | * @throws IOException |
| 117 | */ |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 118 | protected abstract boolean findMatch () throws IOException; |
| 119 | |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 120 | |
| 121 | /** |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 122 | * Defines the properties of a span match. The distance between |
| 123 | * the first |
| 124 | * and the second spans is zero, when there is an intersection |
| 125 | * between them |
| 126 | * in {@link TokenDistanceSpans}, or they occur in the same |
| 127 | * element in {@link ElementDistanceSpans}. |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 128 | * |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 129 | * @param candidateSpan |
| 130 | * a match span |
| 131 | * @param isDistanceZero |
| 132 | * <code>true</code> if the distance between the first |
| 133 | * and the second spans is zero, <code>false</code> |
| 134 | * otherwise. |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 135 | * @throws IOException |
| 136 | */ |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 137 | protected void setMatchProperties (CandidateSpan candidateSpan, |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 138 | boolean isDistanceZero) throws IOException { |
| 139 | |
| 140 | setMatchFirstSpan(candidateSpan); |
| 141 | setMatchSecondSpan(new CandidateSpan(secondSpans)); |
| 142 | |
| 143 | if (isDistanceZero) { |
| 144 | matchStartPosition = Math.min(candidateSpan.getStart(), |
| 145 | secondSpans.start()); |
| 146 | matchEndPosition = Math.max(candidateSpan.getEnd(), |
| 147 | secondSpans.end()); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 148 | } |
| 149 | else { |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 150 | matchStartPosition = candidateSpan.getStart(); |
| 151 | matchEndPosition = secondSpans.end(); |
| 152 | } |
| 153 | |
| 154 | this.matchDocNumber = secondSpans.doc(); |
| 155 | if (collectPayloads) { |
| 156 | if (candidateSpan.getPayloads() != null) { |
| 157 | matchPayload.addAll(candidateSpan.getPayloads()); |
| 158 | } |
| 159 | if (secondSpans.isPayloadAvailable()) { |
| 160 | matchPayload.addAll(secondSpans.getPayload()); |
| 161 | } |
| 162 | } |
| 163 | } |
| 164 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 165 | |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 166 | @Override |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 167 | public boolean skipTo (int target) throws IOException { |
| Eliza Margaretha | 609a5be | 2014-12-18 16:52:20 +0000 | [diff] [blame] | 168 | if (hasMoreSpans && (secondSpans.doc() < target)) { |
| 169 | if (!secondSpans.skipTo(target)) { |
| 170 | candidateList.clear(); |
| 171 | return false; |
| 172 | } |
| 173 | } |
| 174 | |
| 175 | setCandidateList(); |
| 176 | matchPayload.clear(); |
| 177 | isStartEnumeration = false; |
| 178 | return advance(); |
| 179 | } |
| Eliza Margaretha | 198e4ef | 2014-02-10 13:50:50 +0000 | [diff] [blame] | 180 | } |