blob: 5ee5e7556577a72f240cbbe295c58b671c558588 [file] [log] [blame]
Eliza Margarethafb25cef2014-06-06 14:19:07 +00001package de.ids_mannheim.korap.query.spans;
2
3import java.io.IOException;
4import java.nio.ByteBuffer;
5import java.util.ArrayList;
6import java.util.Collections;
7import java.util.List;
8import java.util.Map;
9
Akron700c1eb2015-09-25 16:57:30 +020010import org.apache.lucene.index.LeafReaderContext;
Eliza Margarethafb25cef2014-06-06 14:19:07 +000011import org.apache.lucene.index.Term;
12import org.apache.lucene.index.TermContext;
13import org.apache.lucene.search.spans.Spans;
Eliza Margaretha8551e5b2014-12-15 16:46:18 +000014import org.apache.lucene.search.spans.TermSpans;
Eliza Margarethafb25cef2014-06-06 14:19:07 +000015import org.apache.lucene.util.Bits;
16import org.slf4j.Logger;
17import org.slf4j.LoggerFactory;
18
Eliza Margarethafb25cef2014-06-06 14:19:07 +000019import de.ids_mannheim.korap.query.SpanAttributeQuery;
20
/**
 * Span enumeration of attributes which are term spans with special
 * payload assignments referring to another span (e.g.
 * element/relation span) to which an attribute span belongs. The
 * class is basically a wrapper of Lucene {@link TermSpans} with
 * additional functionality regarding element/relation
 * reference. Element/relation ids are assigned in ascending order
 * starting from the left side.
 * <br/><br/>
 * The enumeration is ordered firstly by the start position of the
 * attribute and secondly by the element/relation id in descending
 * order. This order helps to match elements and attributes faster.
 * 
 * AttributeSpans have the same start and end positions as the
 * element/relation they belong to, thus querying them alone
 * is sufficient to get "any element having a specific
 * attribute".
 * 
 * @author margaretha
 */
margaretha50c76332015-03-19 10:10:39 +010041public class AttributeSpans extends SimpleSpans {
Nils Diewald1455e1e2014-08-01 16:12:43 +000042
Eliza Margaretha6a2e80b2014-12-02 17:03:23 +000043 private List<CandidateAttributeSpan> candidateList;
44 private int currentDoc, currentPosition;
Eliza Margaretha6a2e80b2014-12-02 17:03:23 +000045 private boolean isFinish;
Eliza Margarethafb25cef2014-06-06 14:19:07 +000046
Akron42993552016-02-04 13:24:24 +010047 public static enum PayloadTypeIdentifier {
48 TERM_ATTRIBUTE(16), ELEMENT_ATTRIBUTE(17), RELATION_ATTRIBUTE(18);
margaretha14f918d2015-12-11 11:48:07 +010049
Akron42993552016-02-04 13:24:24 +010050 private int value;
margaretha14f918d2015-12-11 11:48:07 +010051
Akron42993552016-02-04 13:24:24 +010052
53 private PayloadTypeIdentifier (int value) {
54 this.value = value;
55 }
Eliza Margaretha95917962016-11-16 16:07:08 +010056
57
58 public int getValue () {
59 return value;
60 }
Akron42993552016-02-04 13:24:24 +010061 }
margaretha14f918d2015-12-11 11:48:07 +010062
Eliza Margaretha6a2e80b2014-12-02 17:03:23 +000063 protected Logger logger = LoggerFactory.getLogger(AttributeSpans.class);
Eliza Margarethafb25cef2014-06-06 14:19:07 +000064
Nils Diewaldbb33da22015-03-04 16:24:25 +000065
Eliza Margaretha6a2e80b2014-12-02 17:03:23 +000066 /**
Nils Diewaldbb33da22015-03-04 16:24:25 +000067 * Constructs Attributespans based on the specified
68 * SpanAttributeQuery.
Eliza Margaretha6a2e80b2014-12-02 17:03:23 +000069 *
Nils Diewaldbb33da22015-03-04 16:24:25 +000070 * @param spanAttributeQuery
71 * a spanAttributeQuery
Eliza Margaretha6a2e80b2014-12-02 17:03:23 +000072 * @param context
73 * @param acceptDocs
74 * @param termContexts
75 * @throws IOException
76 */
Nils Diewaldbb33da22015-03-04 16:24:25 +000077 public AttributeSpans (SpanAttributeQuery spanAttributeQuery,
Akron700c1eb2015-09-25 16:57:30 +020078 LeafReaderContext context, Bits acceptDocs,
Nils Diewaldbb33da22015-03-04 16:24:25 +000079 Map<Term, TermContext> termContexts)
80 throws IOException {
Eliza Margaretha6a2e80b2014-12-02 17:03:23 +000081 super(spanAttributeQuery, context, acceptDocs, termContexts);
margaretha50c76332015-03-19 10:10:39 +010082 this.hasSpanId = true;
83
Eliza Margaretha6a2e80b2014-12-02 17:03:23 +000084 candidateList = new ArrayList<>();
85 hasMoreSpans = firstSpans.next();
86 if (hasMoreSpans) {
87 currentDoc = firstSpans.doc();
88 currentPosition = firstSpans.start();
89 }
90 }
Eliza Margarethac7fb7312014-07-25 14:11:36 +000091
Nils Diewaldbb33da22015-03-04 16:24:25 +000092
Eliza Margaretha6a2e80b2014-12-02 17:03:23 +000093 @Override
Nils Diewaldbb33da22015-03-04 16:24:25 +000094 public boolean next () throws IOException {
Eliza Margaretha6a2e80b2014-12-02 17:03:23 +000095 isStartEnumeration = false;
96 matchPayload.clear();
97 return advance();
98 }
Eliza Margarethafb25cef2014-06-06 14:19:07 +000099
Nils Diewaldbb33da22015-03-04 16:24:25 +0000100
Eliza Margaretha6a2e80b2014-12-02 17:03:23 +0000101 /**
Nils Diewaldbb33da22015-03-04 16:24:25 +0000102 * Moves to the next match by checking the candidate match list or
Eliza Margaretha95917962016-11-16 16:07:08 +0100103 * setting the list first when it is empty.
Eliza Margaretha6a2e80b2014-12-02 17:03:23 +0000104 *
105 * @return true if a match is found
106 * @throws IOException
107 */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000108 private boolean advance () throws IOException {
Eliza Margaretha6a2e80b2014-12-02 17:03:23 +0000109 while (hasMoreSpans || !candidateList.isEmpty()) {
110 if (!candidateList.isEmpty()) {
111 // set the current match from the first CandidateAttributeSpan
112 // in the candidate list
113 CandidateAttributeSpan cs = candidateList.get(0);
114 this.matchDocNumber = cs.getDoc();
115 this.matchStartPosition = cs.getStart();
116 this.matchEndPosition = cs.getEnd();
Nils Diewaldbb33da22015-03-04 16:24:25 +0000117 this.setSpanId(cs.getSpanId()); // referentId
Eliza Margaretha6a2e80b2014-12-02 17:03:23 +0000118 candidateList.remove(0);
119 return true;
Nils Diewaldbb33da22015-03-04 16:24:25 +0000120 }
121 else {
Eliza Margaretha6a2e80b2014-12-02 17:03:23 +0000122 setCandidateList();
123 currentDoc = firstSpans.doc();
124 currentPosition = firstSpans.start();
125 }
126 }
127 return false;
128 }
Eliza Margarethafb25cef2014-06-06 14:19:07 +0000129
Nils Diewaldbb33da22015-03-04 16:24:25 +0000130
Eliza Margaretha6a2e80b2014-12-02 17:03:23 +0000131 /**
Nils Diewaldbb33da22015-03-04 16:24:25 +0000132 * Collects all the attributes in the same start position and sort
Eliza Margaretha95917962016-11-16 16:07:08 +0100133 * them by element/relation Id in a reverse order (the ones with
134 * the
135 * bigger element/relation Id first).
Eliza Margaretha6a2e80b2014-12-02 17:03:23 +0000136 *
137 * @throws IOException
138 */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000139 private void setCandidateList () throws IOException {
Eliza Margaretha38a94662014-11-20 13:48:00 +0000140
Eliza Margaretha6a2e80b2014-12-02 17:03:23 +0000141 while (hasMoreSpans && firstSpans.doc() == currentDoc
142 && firstSpans.start() == currentPosition) {
Eliza Margaretha38a94662014-11-20 13:48:00 +0000143
Eliza Margaretha6a2e80b2014-12-02 17:03:23 +0000144 candidateList.add(createCandidateSpan());
145 hasMoreSpans = firstSpans.next();
146 }
Eliza Margaretha997ccde2014-07-04 09:20:35 +0000147
Eliza Margaretha6a2e80b2014-12-02 17:03:23 +0000148 Collections.sort(candidateList);
149 Collections.reverse(candidateList);
150 }
Eliza Margaretha997ccde2014-07-04 09:20:35 +0000151
Nils Diewaldbb33da22015-03-04 16:24:25 +0000152
Eliza Margaretha6a2e80b2014-12-02 17:03:23 +0000153 /**
Nils Diewaldbb33da22015-03-04 16:24:25 +0000154 * Creates a CandidateAttributeSpan based on the child span and
Eliza Margaretha95917962016-11-16 16:07:08 +0100155 * set the spanId and elementEnd from its payloads.
Eliza Margaretha6a2e80b2014-12-02 17:03:23 +0000156 *
Nils Diewaldbb33da22015-03-04 16:24:25 +0000157 * @param firstSpans
158 * an AttributeSpans
Eliza Margaretha6a2e80b2014-12-02 17:03:23 +0000159 * @return a CandidateAttributeSpan
160 * @throws IOException
161 */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000162 private CandidateAttributeSpan createCandidateSpan () throws IOException {
Eliza Margaretha6a2e80b2014-12-02 17:03:23 +0000163 List<byte[]> payload = (List<byte[]>) firstSpans.getPayload();
Akron42993552016-02-04 13:24:24 +0100164 ByteBuffer payloadBuffer = ByteBuffer.wrap(payload.get(0));
Eliza Margarethafb25cef2014-06-06 14:19:07 +0000165
Akron42993552016-02-04 13:24:24 +0100166 byte payloadTypeIdentifier = payloadBuffer.get(0);
167 short spanId = payloadBuffer.getShort(5);
Akron42993552016-02-04 13:24:24 +0100168 int end = payloadBuffer.getInt(1);
Eliza Margaretha2db5e232015-03-04 10:20:01 +0000169
Akron42993552016-02-04 13:24:24 +0100170 return new CandidateAttributeSpan(firstSpans, payloadTypeIdentifier,
171 spanId, end);
Eliza Margaretha6a2e80b2014-12-02 17:03:23 +0000172 }
Eliza Margarethafb25cef2014-06-06 14:19:07 +0000173
Nils Diewaldbb33da22015-03-04 16:24:25 +0000174
Eliza Margaretha6a2e80b2014-12-02 17:03:23 +0000175 /**
Nils Diewaldbb33da22015-03-04 16:24:25 +0000176 * Tells if the enumeration of the AttributeSpans has come to an
177 * end.
Eliza Margaretha6a2e80b2014-12-02 17:03:23 +0000178 *
179 * @return true if the enumeration has finished.
180 */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000181 public boolean isFinish () {
Eliza Margaretha6a2e80b2014-12-02 17:03:23 +0000182 return isFinish;
183 }
184
Nils Diewaldbb33da22015-03-04 16:24:25 +0000185
Eliza Margaretha6a2e80b2014-12-02 17:03:23 +0000186 /**
Nils Diewaldbb33da22015-03-04 16:24:25 +0000187 * Sets true if the enumeration of the AttributeSpans has come to
188 * an end.
Eliza Margaretha6a2e80b2014-12-02 17:03:23 +0000189 *
Nils Diewaldbb33da22015-03-04 16:24:25 +0000190 * @param isFinish
191 * <code>true</code> if the enumeration of the
192 * AttributeSpans has come to an end,
193 * <code>false</code> otherwise.
Eliza Margaretha6a2e80b2014-12-02 17:03:23 +0000194 */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000195 public void setFinish (boolean isFinish) {
Eliza Margaretha6a2e80b2014-12-02 17:03:23 +0000196 this.isFinish = isFinish;
197 }
198
Nils Diewaldbb33da22015-03-04 16:24:25 +0000199
Eliza Margaretha6a2e80b2014-12-02 17:03:23 +0000200 @Override
Nils Diewaldbb33da22015-03-04 16:24:25 +0000201 public boolean skipTo (int target) throws IOException {
Eliza Margaretha6a2e80b2014-12-02 17:03:23 +0000202 if (hasMoreSpans && (firstSpans.doc() < target)) {
203 if (!firstSpans.skipTo(target)) {
204 candidateList.clear();
205 return false;
206 }
207 }
208 setCandidateList();
209 matchPayload.clear();
210 isStartEnumeration = false;
211 return advance();
212 }
213
Nils Diewaldbb33da22015-03-04 16:24:25 +0000214
Eliza Margaretha6a2e80b2014-12-02 17:03:23 +0000215 @Override
Nils Diewaldbb33da22015-03-04 16:24:25 +0000216 public long cost () {
Eliza Margaretha6a2e80b2014-12-02 17:03:23 +0000217 return firstSpans.cost();
218 }
219
220 /**
Nils Diewaldbb33da22015-03-04 16:24:25 +0000221 * CandidateAttributeSpan contains information about an Attribute
Eliza Margaretha95917962016-11-16 16:07:08 +0100222 * span. All attribute spans occurring in an identical position
223 * are collected as CandidateAttributeSpans. The list of these
224 * CandidateAttributeSpans are sorted based on the span ids to
225 * which the attributes belong to. The attributes with smaller
226 * spanIds come first on the list.
Eliza Margaretha6a2e80b2014-12-02 17:03:23 +0000227 *
Eliza Margaretha6f989202016-10-14 21:48:29 +0200228 */
229 class CandidateAttributeSpan extends CandidateSpan
230 implements Comparable<CandidateSpan> {
Eliza Margaretha6a2e80b2014-12-02 17:03:23 +0000231
Eliza Margaretha6a2e80b2014-12-02 17:03:23 +0000232 /**
Nils Diewaldbb33da22015-03-04 16:24:25 +0000233 * Construct a CandidateAttributeSpan based on the given span,
234 * spanId,
Eliza Margaretha6a2e80b2014-12-02 17:03:23 +0000235 * and elementEnd.
236 *
Nils Diewaldbb33da22015-03-04 16:24:25 +0000237 * @param span
238 * an AttributeSpans
239 * @param spanId
240 * the element or relation span id to which the
241 * current
242 * state of the specified AttributeSpans belongs
243 * to.
244 * @param elementEnd
245 * the end position of the element or relation span
246 * to
247 * which the current state of the specified
248 * AttributeSpans
249 * belongs to.
Eliza Margaretha6a2e80b2014-12-02 17:03:23 +0000250 * @throws IOException
251 */
Akron42993552016-02-04 13:24:24 +0100252 public CandidateAttributeSpan (Spans span, byte payloadTypeIdenfitier,
253 short spanId, int elementEnd)
Eliza Margaretha6a2e80b2014-12-02 17:03:23 +0000254 throws IOException {
255 super(span);
Akron42993552016-02-04 13:24:24 +0100256 this.spanId = spanId;
Nils Diewaldbb33da22015-03-04 16:24:25 +0000257 this.end = elementEnd;
Akron42993552016-02-04 13:24:24 +0100258 this.payloadTypeIdentifier = payloadTypeIdenfitier;
Eliza Margaretha6a2e80b2014-12-02 17:03:23 +0000259 }
260
Eliza Margaretha2db5e232015-03-04 10:20:01 +0000261
Eliza Margaretha6a2e80b2014-12-02 17:03:23 +0000262 @Override
Nils Diewaldbb33da22015-03-04 16:24:25 +0000263 public int compareTo (CandidateSpan o) {
Eliza Margaretha6a2e80b2014-12-02 17:03:23 +0000264 CandidateAttributeSpan cs = (CandidateAttributeSpan) o;
265 if (this.spanId == cs.spanId)
266 return 0;
267 else if (this.spanId > cs.spanId)
268 return 1;
269 return -1;
270 }
271 }
Eliza Margarethafb25cef2014-06-06 14:19:07 +0000272}