blob: 52b657e7661eaa5397c48f43cc6ea39e3f7f4c0e [file] [log] [blame]
Eliza Margaretha9d1ebeb2014-08-12 11:42:58 +00001package de.ids_mannheim.korap.query.spans;
2
3import java.io.IOException;
4import java.util.Map;
Akronf796b862016-04-29 18:51:25 +02005import java.util.ArrayList;
margaretha3865e522016-05-02 13:24:51 +02006import java.util.PriorityQueue;
Eliza Margaretha9d1ebeb2014-08-12 11:42:58 +00007
Akron700c1eb2015-09-25 16:57:30 +02008import org.apache.lucene.index.LeafReaderContext;
Eliza Margaretha9d1ebeb2014-08-12 11:42:58 +00009import org.apache.lucene.index.Term;
10import org.apache.lucene.index.TermContext;
11import org.apache.lucene.util.Bits;
12
Nils Diewald5380aa62014-09-01 13:21:07 +000013import de.ids_mannheim.korap.query.SpanSubspanQuery;
Eliza Margaretha9d1ebeb2014-08-12 11:42:58 +000014
Akronf796b862016-04-29 18:51:25 +020015import org.slf4j.Logger;
16import org.slf4j.LoggerFactory;
17
Akron9c04ce22016-05-02 16:03:21 +020018// Todo: Sort candidate spans only for negative start offsets!
19
Eliza Margaretha7612bde2015-01-14 10:28:42 +000020/**
Eliza Margaretha58ee0bf2015-01-26 16:37:31 +000021 * Enumeration of SubSpans, which are parts of another Spans. The
22 * SubSpans are specified with a start offset relative to the original
23 * span and a length. If the length is unspecified or 0, the end
24 * position of the subspans is the same as that of the original spans.
Eliza Margaretha7612bde2015-01-14 10:28:42 +000025 *
26 * @author margaretha
Akronf796b862016-04-29 18:51:25 +020027 * @author diewald
Eliza Margaretha7612bde2015-01-14 10:28:42 +000028 *
29 */
30public class SubSpans extends SimpleSpans {
Eliza Margaretha9d1ebeb2014-08-12 11:42:58 +000031
Akronf796b862016-04-29 18:51:25 +020032 // Logger
33 private final Logger log = LoggerFactory.getLogger(SubSpans.class);
Eliza Margaretha9d1ebeb2014-08-12 11:42:58 +000034
Akronf796b862016-04-29 18:51:25 +020035 // This advices the java compiler to ignore all loggings
36 public static final boolean DEBUG = false;
37
38 private int startOffset, length;
margaretha3865e522016-05-02 13:24:51 +020039 private int windowSize;
40 private int currentDoc;
41 private int prevStart;
42 private int prevDoc;
43 private PriorityQueue<CandidateSpan> candidates;
44 private CandidateSpanComparator comparator;
Nils Diewaldbb33da22015-03-04 16:24:25 +000045
Eliza Margaretha7612bde2015-01-14 10:28:42 +000046 /**
Eliza Margaretha58ee0bf2015-01-26 16:37:31 +000047 * Constructs SubSpans for the given {@link SpanSubspanQuery}
48 * specifiying the start offset and the length of the subspans.
Eliza Margaretha7612bde2015-01-14 10:28:42 +000049 *
Eliza Margaretha58ee0bf2015-01-26 16:37:31 +000050 * @param subspanQuery
51 * a SpanSubspanQuery
Eliza Margaretha7612bde2015-01-14 10:28:42 +000052 * @param context
53 * @param acceptDocs
54 * @param termContexts
55 * @throws IOException
56 */
Akron42993552016-02-04 13:24:24 +010057 public SubSpans (SpanSubspanQuery subspanQuery, LeafReaderContext context,
58 Bits acceptDocs, Map<Term, TermContext> termContexts)
59 throws IOException {
Eliza Margaretha7612bde2015-01-14 10:28:42 +000060 super(subspanQuery, context, acceptDocs, termContexts);
61 this.startOffset = subspanQuery.getStartOffset();
62 this.length = subspanQuery.getLength();
Akronf796b862016-04-29 18:51:25 +020063 this.matchPayload = new ArrayList<byte[]>(6);
margaretha3865e522016-05-02 13:24:51 +020064 this.windowSize = subspanQuery.getWindowSize();
65 candidates = new PriorityQueue<>(windowSize, comparator);
Akronf796b862016-04-29 18:51:25 +020066
67 if (DEBUG) {
68 log.trace("Init SubSpan at {} with length {}", this.startOffset, this.length);
69 };
Eliza Margaretha7612bde2015-01-14 10:28:42 +000070 hasMoreSpans = firstSpans.next();
71 }
Eliza Margaretha9d1ebeb2014-08-12 11:42:58 +000072
Nils Diewaldbb33da22015-03-04 16:24:25 +000073
Eliza Margaretha7612bde2015-01-14 10:28:42 +000074 @Override
Nils Diewaldbb33da22015-03-04 16:24:25 +000075 public boolean next () throws IOException {
Eliza Margaretha7612bde2015-01-14 10:28:42 +000076 isStartEnumeration = false;
77 return advance();
78 }
Eliza Margaretha9d1ebeb2014-08-12 11:42:58 +000079
Nils Diewaldbb33da22015-03-04 16:24:25 +000080
Eliza Margaretha7612bde2015-01-14 10:28:42 +000081 /**
82 * Advances the SubSpans to the next match.
83 *
Eliza Margaretha58ee0bf2015-01-26 16:37:31 +000084 * @return <code>true</code> if a match is found,
85 * <code>false</code> otherwise.
Eliza Margaretha7612bde2015-01-14 10:28:42 +000086 * @throws IOException
87 */
Nils Diewaldbb33da22015-03-04 16:24:25 +000088 private boolean advance () throws IOException {
margaretha3865e522016-05-02 13:24:51 +020089 while (hasMoreSpans || candidates.size() > 0) {
90 CandidateSpan cs = new CandidateSpan(firstSpans);
91 if (startOffset > 0) {
92 if (findMatch(cs)) {
93 setMatch(cs);
94 hasMoreSpans = firstSpans.next();
95 return true;
96 }
Eliza Margarethaafe98122015-01-23 17:37:57 +000097 hasMoreSpans = firstSpans.next();
margaretha3865e522016-05-02 13:24:51 +020098 }
99 else if (candidates.isEmpty()) {
100 currentDoc = firstSpans.doc();
101 collectCandidates();
102 }
103 else {
104 setMatch(candidates.poll());
105 collectCandidates();
Eliza Margarethaafe98122015-01-23 17:37:57 +0000106 return true;
107 }
Eliza Margaretha7612bde2015-01-14 10:28:42 +0000108 }
109 return false;
110 }
111
margaretha3865e522016-05-02 13:24:51 +0200112 private void collectCandidates() throws IOException {
113
114 while (hasMoreSpans && candidates.size() < windowSize
115 && firstSpans.doc() == currentDoc) {
116 CandidateSpan cs;
117 if (findMatch(cs = new CandidateSpan(firstSpans))) {
118 if (cs.getDoc() == prevDoc && cs.getStart() < prevStart) {
119 log.warn("Span (" + cs.getStart() + ", " + cs.getEnd()
120 + ") is out of order and skipped.");
121 }
122 else {
123 candidates.add(cs);
124 }
125 }
126 hasMoreSpans = firstSpans.next();
127 }
128 }
Nils Diewaldbb33da22015-03-04 16:24:25 +0000129
Eliza Margaretha7612bde2015-01-14 10:28:42 +0000130 /**
131 * Sets the properties of the current match/subspan.
132 *
133 * @throws IOException
134 */
margaretha3865e522016-05-02 13:24:51 +0200135 public boolean findMatch(CandidateSpan cs) throws IOException {
Akronf796b862016-04-29 18:51:25 +0200136
137 // Check at span ending
Eliza Margarethaafe98122015-01-23 17:37:57 +0000138 if (this.startOffset < 0) {
margaretha3865e522016-05-02 13:24:51 +0200139 cs.setStart(firstSpans.end() + startOffset);
140 if (cs.getStart() < firstSpans.start()) {
141 cs.setStart(firstSpans.start());
Akronf796b862016-04-29 18:51:25 +0200142 };
Eliza Margarethaafe98122015-01-23 17:37:57 +0000143 }
Akronf796b862016-04-29 18:51:25 +0200144 // Check at span beginning
Eliza Margarethaafe98122015-01-23 17:37:57 +0000145 else {
margaretha3865e522016-05-02 13:24:51 +0200146 cs.setStart(firstSpans.start() + startOffset);
147 if (cs.getStart() >= firstSpans.end()) {
Eliza Margarethaafe98122015-01-23 17:37:57 +0000148 return false;
149 }
150 }
Eliza Margaretha7612bde2015-01-14 10:28:42 +0000151
Akronf796b862016-04-29 18:51:25 +0200152 // Find end position of span
Eliza Margaretha58ee0bf2015-01-26 16:37:31 +0000153 if (this.length > 0) {
margaretha3865e522016-05-02 13:24:51 +0200154 cs.setEnd(cs.getStart() + this.length);
155 if (cs.getEnd() > firstSpans.end()) {
156 cs.setEnd(firstSpans.end());
Eliza Margaretha58ee0bf2015-01-26 16:37:31 +0000157 }
158 }
159 else {
margaretha3865e522016-05-02 13:24:51 +0200160 cs.setEnd(firstSpans.end());
Eliza Margarethaafe98122015-01-23 17:37:57 +0000161 }
Akronf796b862016-04-29 18:51:25 +0200162
Akron9c04ce22016-05-02 16:03:21 +0200163 // Claer payloads of candidatespan
164 cs.getPayloads().clear();
Akronf796b862016-04-29 18:51:25 +0200165
166 // Remove element payloads
167 for (byte[] payload : firstSpans.getPayload()) {
Akrondecc67e2016-04-29 19:16:06 +0200168 if ((payload[0] & ((byte) 64)) != 0) {
Akronf796b862016-04-29 18:51:25 +0200169 continue;
170 };
margaretha3865e522016-05-02 13:24:51 +0200171 cs.getPayloads().add(payload.clone());
Akronf796b862016-04-29 18:51:25 +0200172 };
173
margaretha3865e522016-05-02 13:24:51 +0200174 cs.setDoc(firstSpans.doc());
Akronf796b862016-04-29 18:51:25 +0200175
176 if (DEBUG) {
177 log.trace("Start at absolute position {} " +
178 "and end at absolute position {}",
margaretha3865e522016-05-02 13:24:51 +0200179 cs.getStart(),
180 cs.getEnd());
Akronf796b862016-04-29 18:51:25 +0200181 };
182
Eliza Margarethaafe98122015-01-23 17:37:57 +0000183 return true;
Eliza Margaretha7612bde2015-01-14 10:28:42 +0000184 }
185
margaretha3865e522016-05-02 13:24:51 +0200186 private void setMatch(CandidateSpan cs) {
187 matchStartPosition = cs.getStart();
188 prevStart = matchStartPosition;
189 matchEndPosition = cs.getEnd();
190 matchDocNumber = cs.getDoc();
191 prevDoc = matchDocNumber;
192 matchPayload.clear();
193 matchPayload.addAll(cs.getPayloads());
194 }
Nils Diewaldbb33da22015-03-04 16:24:25 +0000195
Eliza Margaretha7612bde2015-01-14 10:28:42 +0000196 @Override
Nils Diewaldbb33da22015-03-04 16:24:25 +0000197 public boolean skipTo (int target) throws IOException {
margaretha3865e522016-05-02 13:24:51 +0200198 if (candidates.size() > 0) {
199 CandidateSpan cs;
200 while ((cs = candidates.poll()) != null) {
201 if (cs.getDoc() == target) {
202 return next();
203 }
Eliza Margaretha7612bde2015-01-14 10:28:42 +0000204 }
205 }
margaretha3865e522016-05-02 13:24:51 +0200206 if (firstSpans.doc() == target) {
207 return next();
208 }
209 if (firstSpans.doc() < target && firstSpans.skipTo(target)) {
210 return next();
211 }
Eliza Margaretha7612bde2015-01-14 10:28:42 +0000212 return advance();
213 }
214
Nils Diewaldbb33da22015-03-04 16:24:25 +0000215
Eliza Margaretha7612bde2015-01-14 10:28:42 +0000216 @Override
Nils Diewaldbb33da22015-03-04 16:24:25 +0000217 public long cost () {
Eliza Margaretha7612bde2015-01-14 10:28:42 +0000218 return firstSpans.cost() + 1;
219 }
Eliza Margaretha9d1ebeb2014-08-12 11:42:58 +0000220
221}