| Eliza Margaretha | 9d1ebeb | 2014-08-12 11:42:58 +0000 | [diff] [blame] | 1 | package de.ids_mannheim.korap.query.spans; |
| 2 | |
| 3 | import java.io.IOException; |
| 4 | import java.util.Map; |
| Akron | f796b86 | 2016-04-29 18:51:25 +0200 | [diff] [blame] | 5 | import java.util.ArrayList; |
| margaretha | 3865e52 | 2016-05-02 13:24:51 +0200 | [diff] [blame] | 6 | import java.util.PriorityQueue; |
| Eliza Margaretha | 9d1ebeb | 2014-08-12 11:42:58 +0000 | [diff] [blame] | 7 | |
| Akron | 700c1eb | 2015-09-25 16:57:30 +0200 | [diff] [blame] | 8 | import org.apache.lucene.index.LeafReaderContext; |
| Eliza Margaretha | 9d1ebeb | 2014-08-12 11:42:58 +0000 | [diff] [blame] | 9 | import org.apache.lucene.index.Term; |
| 10 | import org.apache.lucene.index.TermContext; |
| 11 | import org.apache.lucene.util.Bits; |
| 12 | |
| Nils Diewald | 5380aa6 | 2014-09-01 13:21:07 +0000 | [diff] [blame] | 13 | import de.ids_mannheim.korap.query.SpanSubspanQuery; |
| Eliza Margaretha | 9d1ebeb | 2014-08-12 11:42:58 +0000 | [diff] [blame] | 14 | |
| Akron | f796b86 | 2016-04-29 18:51:25 +0200 | [diff] [blame] | 15 | import org.slf4j.Logger; |
| 16 | import org.slf4j.LoggerFactory; |
| 17 | |
| Akron | 9c04ce2 | 2016-05-02 16:03:21 +0200 | [diff] [blame] | 18 | // Todo: Sort candidate spans only for negative start offsets! |
| 19 | |
| Eliza Margaretha | 7612bde | 2015-01-14 10:28:42 +0000 | [diff] [blame] | 20 | /** |
| Eliza Margaretha | 58ee0bf | 2015-01-26 16:37:31 +0000 | [diff] [blame] | 21 | * Enumeration of SubSpans, which are parts of another Spans. The |
| 22 | * SubSpans are specified with a start offset relative to the original |
| 23 | * span and a length. If the length is unspecified or 0, the end |
| 24 | * position of the subspans is the same as that of the original spans. |
| Eliza Margaretha | 7612bde | 2015-01-14 10:28:42 +0000 | [diff] [blame] | 25 | * |
| 26 | * @author margaretha |
| Akron | f796b86 | 2016-04-29 18:51:25 +0200 | [diff] [blame] | 27 | * @author diewald |
| Eliza Margaretha | 7612bde | 2015-01-14 10:28:42 +0000 | [diff] [blame] | 28 | * |
| 29 | */ |
| 30 | public class SubSpans extends SimpleSpans { |
| Eliza Margaretha | 9d1ebeb | 2014-08-12 11:42:58 +0000 | [diff] [blame] | 31 | |
| Akron | f796b86 | 2016-04-29 18:51:25 +0200 | [diff] [blame] | 32 | // Logger |
| 33 | private final Logger log = LoggerFactory.getLogger(SubSpans.class); |
| Eliza Margaretha | 9d1ebeb | 2014-08-12 11:42:58 +0000 | [diff] [blame] | 34 | |
| Akron | f796b86 | 2016-04-29 18:51:25 +0200 | [diff] [blame] | 35 | // This advices the java compiler to ignore all loggings |
| 36 | public static final boolean DEBUG = false; |
| 37 | |
| 38 | private int startOffset, length; |
| margaretha | 3865e52 | 2016-05-02 13:24:51 +0200 | [diff] [blame] | 39 | private int windowSize; |
| 40 | private int currentDoc; |
| 41 | private int prevStart; |
| 42 | private int prevDoc; |
| 43 | private PriorityQueue<CandidateSpan> candidates; |
| 44 | private CandidateSpanComparator comparator; |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 45 | |
| Eliza Margaretha | 7612bde | 2015-01-14 10:28:42 +0000 | [diff] [blame] | 46 | /** |
| Eliza Margaretha | 58ee0bf | 2015-01-26 16:37:31 +0000 | [diff] [blame] | 47 | * Constructs SubSpans for the given {@link SpanSubspanQuery} |
| 48 | * specifiying the start offset and the length of the subspans. |
| Eliza Margaretha | 7612bde | 2015-01-14 10:28:42 +0000 | [diff] [blame] | 49 | * |
| Eliza Margaretha | 58ee0bf | 2015-01-26 16:37:31 +0000 | [diff] [blame] | 50 | * @param subspanQuery |
| 51 | * a SpanSubspanQuery |
| Eliza Margaretha | 7612bde | 2015-01-14 10:28:42 +0000 | [diff] [blame] | 52 | * @param context |
| 53 | * @param acceptDocs |
| 54 | * @param termContexts |
| 55 | * @throws IOException |
| 56 | */ |
| Akron | 4299355 | 2016-02-04 13:24:24 +0100 | [diff] [blame] | 57 | public SubSpans (SpanSubspanQuery subspanQuery, LeafReaderContext context, |
| 58 | Bits acceptDocs, Map<Term, TermContext> termContexts) |
| 59 | throws IOException { |
| Eliza Margaretha | 7612bde | 2015-01-14 10:28:42 +0000 | [diff] [blame] | 60 | super(subspanQuery, context, acceptDocs, termContexts); |
| 61 | this.startOffset = subspanQuery.getStartOffset(); |
| 62 | this.length = subspanQuery.getLength(); |
| Akron | f796b86 | 2016-04-29 18:51:25 +0200 | [diff] [blame] | 63 | this.matchPayload = new ArrayList<byte[]>(6); |
| margaretha | 3865e52 | 2016-05-02 13:24:51 +0200 | [diff] [blame] | 64 | this.windowSize = subspanQuery.getWindowSize(); |
| 65 | candidates = new PriorityQueue<>(windowSize, comparator); |
| Akron | f796b86 | 2016-04-29 18:51:25 +0200 | [diff] [blame] | 66 | |
| 67 | if (DEBUG) { |
| 68 | log.trace("Init SubSpan at {} with length {}", this.startOffset, this.length); |
| 69 | }; |
| Eliza Margaretha | 7612bde | 2015-01-14 10:28:42 +0000 | [diff] [blame] | 70 | hasMoreSpans = firstSpans.next(); |
| 71 | } |
| Eliza Margaretha | 9d1ebeb | 2014-08-12 11:42:58 +0000 | [diff] [blame] | 72 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 73 | |
| Eliza Margaretha | 7612bde | 2015-01-14 10:28:42 +0000 | [diff] [blame] | 74 | @Override |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 75 | public boolean next () throws IOException { |
| Eliza Margaretha | 7612bde | 2015-01-14 10:28:42 +0000 | [diff] [blame] | 76 | isStartEnumeration = false; |
| 77 | return advance(); |
| 78 | } |
| Eliza Margaretha | 9d1ebeb | 2014-08-12 11:42:58 +0000 | [diff] [blame] | 79 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 80 | |
| Eliza Margaretha | 7612bde | 2015-01-14 10:28:42 +0000 | [diff] [blame] | 81 | /** |
| 82 | * Advances the SubSpans to the next match. |
| 83 | * |
| Eliza Margaretha | 58ee0bf | 2015-01-26 16:37:31 +0000 | [diff] [blame] | 84 | * @return <code>true</code> if a match is found, |
| 85 | * <code>false</code> otherwise. |
| Eliza Margaretha | 7612bde | 2015-01-14 10:28:42 +0000 | [diff] [blame] | 86 | * @throws IOException |
| 87 | */ |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 88 | private boolean advance () throws IOException { |
| margaretha | 3865e52 | 2016-05-02 13:24:51 +0200 | [diff] [blame] | 89 | while (hasMoreSpans || candidates.size() > 0) { |
| 90 | CandidateSpan cs = new CandidateSpan(firstSpans); |
| 91 | if (startOffset > 0) { |
| 92 | if (findMatch(cs)) { |
| 93 | setMatch(cs); |
| 94 | hasMoreSpans = firstSpans.next(); |
| 95 | return true; |
| 96 | } |
| Eliza Margaretha | afe9812 | 2015-01-23 17:37:57 +0000 | [diff] [blame] | 97 | hasMoreSpans = firstSpans.next(); |
| margaretha | 3865e52 | 2016-05-02 13:24:51 +0200 | [diff] [blame] | 98 | } |
| 99 | else if (candidates.isEmpty()) { |
| 100 | currentDoc = firstSpans.doc(); |
| 101 | collectCandidates(); |
| 102 | } |
| 103 | else { |
| 104 | setMatch(candidates.poll()); |
| 105 | collectCandidates(); |
| Eliza Margaretha | afe9812 | 2015-01-23 17:37:57 +0000 | [diff] [blame] | 106 | return true; |
| 107 | } |
| Eliza Margaretha | 7612bde | 2015-01-14 10:28:42 +0000 | [diff] [blame] | 108 | } |
| 109 | return false; |
| 110 | } |
| 111 | |
| margaretha | 3865e52 | 2016-05-02 13:24:51 +0200 | [diff] [blame] | 112 | private void collectCandidates() throws IOException { |
| 113 | |
| 114 | while (hasMoreSpans && candidates.size() < windowSize |
| 115 | && firstSpans.doc() == currentDoc) { |
| 116 | CandidateSpan cs; |
| 117 | if (findMatch(cs = new CandidateSpan(firstSpans))) { |
| 118 | if (cs.getDoc() == prevDoc && cs.getStart() < prevStart) { |
| 119 | log.warn("Span (" + cs.getStart() + ", " + cs.getEnd() |
| 120 | + ") is out of order and skipped."); |
| 121 | } |
| 122 | else { |
| 123 | candidates.add(cs); |
| 124 | } |
| 125 | } |
| 126 | hasMoreSpans = firstSpans.next(); |
| 127 | } |
| 128 | } |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 129 | |
| Eliza Margaretha | 7612bde | 2015-01-14 10:28:42 +0000 | [diff] [blame] | 130 | /** |
| 131 | * Sets the properties of the current match/subspan. |
| 132 | * |
| 133 | * @throws IOException |
| 134 | */ |
| margaretha | 3865e52 | 2016-05-02 13:24:51 +0200 | [diff] [blame] | 135 | public boolean findMatch(CandidateSpan cs) throws IOException { |
| Akron | f796b86 | 2016-04-29 18:51:25 +0200 | [diff] [blame] | 136 | |
| 137 | // Check at span ending |
| Eliza Margaretha | afe9812 | 2015-01-23 17:37:57 +0000 | [diff] [blame] | 138 | if (this.startOffset < 0) { |
| margaretha | 3865e52 | 2016-05-02 13:24:51 +0200 | [diff] [blame] | 139 | cs.setStart(firstSpans.end() + startOffset); |
| 140 | if (cs.getStart() < firstSpans.start()) { |
| 141 | cs.setStart(firstSpans.start()); |
| Akron | f796b86 | 2016-04-29 18:51:25 +0200 | [diff] [blame] | 142 | }; |
| Eliza Margaretha | afe9812 | 2015-01-23 17:37:57 +0000 | [diff] [blame] | 143 | } |
| Akron | f796b86 | 2016-04-29 18:51:25 +0200 | [diff] [blame] | 144 | // Check at span beginning |
| Eliza Margaretha | afe9812 | 2015-01-23 17:37:57 +0000 | [diff] [blame] | 145 | else { |
| margaretha | 3865e52 | 2016-05-02 13:24:51 +0200 | [diff] [blame] | 146 | cs.setStart(firstSpans.start() + startOffset); |
| 147 | if (cs.getStart() >= firstSpans.end()) { |
| Eliza Margaretha | afe9812 | 2015-01-23 17:37:57 +0000 | [diff] [blame] | 148 | return false; |
| 149 | } |
| 150 | } |
| Eliza Margaretha | 7612bde | 2015-01-14 10:28:42 +0000 | [diff] [blame] | 151 | |
| Akron | f796b86 | 2016-04-29 18:51:25 +0200 | [diff] [blame] | 152 | // Find end position of span |
| Eliza Margaretha | 58ee0bf | 2015-01-26 16:37:31 +0000 | [diff] [blame] | 153 | if (this.length > 0) { |
| margaretha | 3865e52 | 2016-05-02 13:24:51 +0200 | [diff] [blame] | 154 | cs.setEnd(cs.getStart() + this.length); |
| 155 | if (cs.getEnd() > firstSpans.end()) { |
| 156 | cs.setEnd(firstSpans.end()); |
| Eliza Margaretha | 58ee0bf | 2015-01-26 16:37:31 +0000 | [diff] [blame] | 157 | } |
| 158 | } |
| 159 | else { |
| margaretha | 3865e52 | 2016-05-02 13:24:51 +0200 | [diff] [blame] | 160 | cs.setEnd(firstSpans.end()); |
| Eliza Margaretha | afe9812 | 2015-01-23 17:37:57 +0000 | [diff] [blame] | 161 | } |
| Akron | f796b86 | 2016-04-29 18:51:25 +0200 | [diff] [blame] | 162 | |
| Akron | 9c04ce2 | 2016-05-02 16:03:21 +0200 | [diff] [blame] | 163 | // Claer payloads of candidatespan |
| 164 | cs.getPayloads().clear(); |
| Akron | f796b86 | 2016-04-29 18:51:25 +0200 | [diff] [blame] | 165 | |
| 166 | // Remove element payloads |
| 167 | for (byte[] payload : firstSpans.getPayload()) { |
| Akron | decc67e | 2016-04-29 19:16:06 +0200 | [diff] [blame] | 168 | if ((payload[0] & ((byte) 64)) != 0) { |
| Akron | f796b86 | 2016-04-29 18:51:25 +0200 | [diff] [blame] | 169 | continue; |
| 170 | }; |
| margaretha | 3865e52 | 2016-05-02 13:24:51 +0200 | [diff] [blame] | 171 | cs.getPayloads().add(payload.clone()); |
| Akron | f796b86 | 2016-04-29 18:51:25 +0200 | [diff] [blame] | 172 | }; |
| 173 | |
| margaretha | 3865e52 | 2016-05-02 13:24:51 +0200 | [diff] [blame] | 174 | cs.setDoc(firstSpans.doc()); |
| Akron | f796b86 | 2016-04-29 18:51:25 +0200 | [diff] [blame] | 175 | |
| 176 | if (DEBUG) { |
| 177 | log.trace("Start at absolute position {} " + |
| 178 | "and end at absolute position {}", |
| margaretha | 3865e52 | 2016-05-02 13:24:51 +0200 | [diff] [blame] | 179 | cs.getStart(), |
| 180 | cs.getEnd()); |
| Akron | f796b86 | 2016-04-29 18:51:25 +0200 | [diff] [blame] | 181 | }; |
| 182 | |
| Eliza Margaretha | afe9812 | 2015-01-23 17:37:57 +0000 | [diff] [blame] | 183 | return true; |
| Eliza Margaretha | 7612bde | 2015-01-14 10:28:42 +0000 | [diff] [blame] | 184 | } |
| 185 | |
| margaretha | 3865e52 | 2016-05-02 13:24:51 +0200 | [diff] [blame] | 186 | private void setMatch(CandidateSpan cs) { |
| 187 | matchStartPosition = cs.getStart(); |
| 188 | prevStart = matchStartPosition; |
| 189 | matchEndPosition = cs.getEnd(); |
| 190 | matchDocNumber = cs.getDoc(); |
| 191 | prevDoc = matchDocNumber; |
| 192 | matchPayload.clear(); |
| 193 | matchPayload.addAll(cs.getPayloads()); |
| 194 | } |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 195 | |
| Eliza Margaretha | 7612bde | 2015-01-14 10:28:42 +0000 | [diff] [blame] | 196 | @Override |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 197 | public boolean skipTo (int target) throws IOException { |
| margaretha | 3865e52 | 2016-05-02 13:24:51 +0200 | [diff] [blame] | 198 | if (candidates.size() > 0) { |
| 199 | CandidateSpan cs; |
| 200 | while ((cs = candidates.poll()) != null) { |
| 201 | if (cs.getDoc() == target) { |
| 202 | return next(); |
| 203 | } |
| Eliza Margaretha | 7612bde | 2015-01-14 10:28:42 +0000 | [diff] [blame] | 204 | } |
| 205 | } |
| margaretha | 3865e52 | 2016-05-02 13:24:51 +0200 | [diff] [blame] | 206 | if (firstSpans.doc() == target) { |
| 207 | return next(); |
| 208 | } |
| 209 | if (firstSpans.doc() < target && firstSpans.skipTo(target)) { |
| 210 | return next(); |
| 211 | } |
| Eliza Margaretha | 7612bde | 2015-01-14 10:28:42 +0000 | [diff] [blame] | 212 | return advance(); |
| 213 | } |
| 214 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 215 | |
| Eliza Margaretha | 7612bde | 2015-01-14 10:28:42 +0000 | [diff] [blame] | 216 | @Override |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 217 | public long cost () { |
| Eliza Margaretha | 7612bde | 2015-01-14 10:28:42 +0000 | [diff] [blame] | 218 | return firstSpans.cost() + 1; |
| 219 | } |
| Eliza Margaretha | 9d1ebeb | 2014-08-12 11:42:58 +0000 | [diff] [blame] | 220 | |
| 221 | } |