blob: 9a1d578122fd71405db54344d8864aee1ca28d54 [file] [log] [blame]
Eliza Margarethaf13b8ad2014-10-13 16:36:28 +00001package de.ids_mannheim.korap.query.spans;
2
3import java.io.IOException;
Eliza Margaretha0170b882014-10-29 15:49:31 +00004import java.nio.ByteBuffer;
Eliza Margarethaf13b8ad2014-10-13 16:36:28 +00005import java.util.ArrayList;
6import java.util.Collections;
7import java.util.List;
8import java.util.Map;
9
10import org.apache.lucene.index.AtomicReaderContext;
11import org.apache.lucene.index.Term;
12import org.apache.lucene.index.TermContext;
Eliza Margarethaf13b8ad2014-10-13 16:36:28 +000013import org.apache.lucene.search.spans.Spans;
14import org.apache.lucene.search.spans.TermSpans;
15import org.apache.lucene.util.Bits;
Eliza Margarethaf13b8ad2014-10-13 16:36:28 +000016import org.slf4j.Logger;
17import org.slf4j.LoggerFactory;
18
19import de.ids_mannheim.korap.query.SpanRelationQuery;
Eliza Margarethaf13b8ad2014-10-13 16:36:28 +000020
Eliza Margaretha493bfa92015-01-13 16:16:38 +000021/**
22 * Enumeration of spans denoting relations between two tokens/elements. The
23 * start and end of a RelationSpan always denote the start and end of the
24 * left-side token/element.
Eliza Margarethaf13b8ad2014-10-13 16:36:28 +000025 *
Eliza Margaretha493bfa92015-01-13 16:16:38 +000026 * There are 4 types of relations, which is differentiated by the payload length
27 * in bytes.
28 * <ol>
29 * <li>Token to token relation (1 int & 3 short, length: 10)</li>
30 * <li>Token to span (2 int & 3 short, length: 14)</li>
31 * <li>Span to token (int, byte, int, 3 short, length: 15)</li>
32 * <li>Span to Span (3 int & 3 short, length: 18)</li>
33 * </ol>
34 * Every integer value denotes the start/end position of the start/target of a
35 * relation, in this format: (sourceEndPos?, startTargetPos, endTargetPos?). The
36 * end position of a token is identical to its start position, and therefore not
37 * is saved in a payload.
Eliza Margarethaf13b8ad2014-10-13 16:36:28 +000038 *
Eliza Margaretha493bfa92015-01-13 16:16:38 +000039 * The short values denote the relation id, left id, and right id. The byte in
40 * relation #3 is just a dummy to create a different length from the relation
41 * #2.
Eliza Margarethaf13b8ad2014-10-13 16:36:28 +000042 *
Eliza Margaretha493bfa92015-01-13 16:16:38 +000043 * NOTE: Sorting of the candidate spans can alternatively be done in indexing,
44 * instead of here. (first by left positions and then by right positions)
45 *
46 * @author margaretha
Eliza Margarethaf13b8ad2014-10-13 16:36:28 +000047 * */
Eliza Margaretha493bfa92015-01-13 16:16:38 +000048public class RelationSpans extends RelationBaseSpans {
Eliza Margarethaf13b8ad2014-10-13 16:36:28 +000049
Eliza Margaretha493bfa92015-01-13 16:16:38 +000050 private int currentDoc, currentPosition;
51 private TermSpans relationTermSpan;
Eliza Margarethaf13b8ad2014-10-13 16:36:28 +000052
Eliza Margaretha493bfa92015-01-13 16:16:38 +000053 protected Logger logger = LoggerFactory.getLogger(RelationSpans.class);
54 private List<CandidateRelationSpan> candidateList;
Eliza Margarethaf13b8ad2014-10-13 16:36:28 +000055
Eliza Margaretha493bfa92015-01-13 16:16:38 +000056 /**
Eliza Margaretha7612bde2015-01-14 10:28:42 +000057 * Constructs RelationSpans from the given {@link SpanRelationQuery}.
Eliza Margaretha493bfa92015-01-13 16:16:38 +000058 *
59 * @param relationSpanQuery a SpanRelationQuery
60 * @param context
61 * @param acceptDocs
62 * @param termContexts
63 * @throws IOException
64 */
65 public RelationSpans(SpanRelationQuery relationSpanQuery,
66 AtomicReaderContext context, Bits acceptDocs,
67 Map<Term, TermContext> termContexts) throws IOException {
68 super(relationSpanQuery, context, acceptDocs, termContexts);
69 candidateList = new ArrayList<>();
70 relationTermSpan = (TermSpans) firstSpans;
71 hasMoreSpans = relationTermSpan.next();
72 }
Eliza Margarethaf13b8ad2014-10-13 16:36:28 +000073
Eliza Margaretha493bfa92015-01-13 16:16:38 +000074 @Override
75 public boolean next() throws IOException {
76 isStartEnumeration = false;
77 return advance();
78 }
Eliza Margarethaf13b8ad2014-10-13 16:36:28 +000079
Eliza Margaretha493bfa92015-01-13 16:16:38 +000080 /**
81 * Returns true if there is a next match by checking if the CandidateList is
82 * not empty and set the first element of the list as the next match.
83 * Otherwise, if the RelationSpan has not ended yet, try to set the
84 * CandidateList.
85 *
86 * @return true if there is a next match.
87 * @throws IOException
88 */
89 private boolean advance() throws IOException {
90 while (hasMoreSpans || !candidateList.isEmpty()) {
91 if (!candidateList.isEmpty()) {
92 CandidateRelationSpan cs = candidateList.get(0);
93 this.matchDocNumber = cs.getDoc();
94 this.matchStartPosition = cs.getStart();
95 this.matchEndPosition = cs.getEnd();
Eliza Margaretha2db5e232015-03-04 10:20:01 +000096 this.matchPayload = cs.getPayloads();
Eliza Margaretha493bfa92015-01-13 16:16:38 +000097 this.setRightStart(cs.getRightStart());
98 this.setRightEnd(cs.getRightEnd());
99 this.spanId = cs.getSpanId(); // relation id
100 this.leftId = cs.getLeftId();
101 this.rightId = cs.getRightId();
102 candidateList.remove(0);
103 return true;
Eliza Margaretha2db5e232015-03-04 10:20:01 +0000104 }
105 else {
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000106 setCandidateList();
107 currentDoc = relationTermSpan.doc();
108 currentPosition = relationTermSpan.start();
109 }
110 }
111 return false;
112 }
Eliza Margaretha3e50bc42014-10-22 15:29:15 +0000113
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000114 /**
115 * Setting the CandidateList by adding all relationTermSpan whose start
116 * position is the same as the current span position, and sort the
117 * candidateList.
118 *
119 * @throws IOException
120 */
121 private void setCandidateList() throws IOException {
122 while (hasMoreSpans && relationTermSpan.doc() == currentDoc
123 && relationTermSpan.start() == currentPosition) {
Eliza Margaretha2db5e232015-03-04 10:20:01 +0000124 CandidateRelationSpan cs = new CandidateRelationSpan(relationTermSpan);
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000125 readPayload(cs);
Eliza Margaretha2db5e232015-03-04 10:20:01 +0000126 setPayload(cs);
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000127 candidateList.add(cs);
128 hasMoreSpans = relationTermSpan.next();
129 }
130 Collections.sort(candidateList);
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000131 }
Eliza Margarethaf13b8ad2014-10-13 16:36:28 +0000132
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000133 /**
134 * Identify the relation type of the given {@link CandidateRelationSpan} by
135 * checking the length of its payloads, and set some properties of the span
136 * based on the payloads.
137 *
138 * @param cs a CandidateRelationSpan
139 */
140 private void readPayload(CandidateRelationSpan cs) {
141 List<byte[]> payload = (List<byte[]>) cs.getPayloads();
142 int length = payload.get(0).length;
143 ByteBuffer bb = ByteBuffer.allocate(length);
144 bb.put(payload.get(0));
Eliza Margarethad12cabb2014-10-27 17:45:34 +0000145
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000146 int i;
147 switch (length) {
148 case 10: // Token to token
149 i = bb.getInt(0);
150 cs.setRightStart(i - 1);
151 cs.setRightEnd(i);
152 break;
Eliza Margarethad12cabb2014-10-27 17:45:34 +0000153
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000154 case 14: // Token to span
155 cs.setRightStart(bb.getInt(0));
156 cs.setRightEnd(bb.getInt(4));
157 break;
Eliza Margarethaf13b8ad2014-10-13 16:36:28 +0000158
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000159 case 15: // Span to token
160 cs.setEnd(bb.getInt(0));
161 i = bb.getInt(5);
162 cs.setRightStart(i - 1);
163 cs.setRightEnd(i);
164 break;
Eliza Margarethaf13b8ad2014-10-13 16:36:28 +0000165
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000166 case 18: // Span to span
167 cs.setEnd(bb.getInt(0));
168 cs.setRightStart(bb.getInt(4));
169 cs.setRightEnd(bb.getInt(8));
170 break;
171 }
Eliza Margarethaf13b8ad2014-10-13 16:36:28 +0000172
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000173 cs.setRightId(bb.getShort(length - 2)); //right id
174 cs.setLeftId(bb.getShort(length - 4)); //left id
175 cs.setSpanId(bb.getShort(length - 6)); //relation id
176 // Payload is cleared.
177 }
Eliza Margarethaf13b8ad2014-10-13 16:36:28 +0000178
Eliza Margaretha2db5e232015-03-04 10:20:01 +0000179 private void setPayload(CandidateRelationSpan cs) throws IOException {
180 ArrayList<byte[]> payload = new ArrayList<byte[]>();
181 if (relationTermSpan.isPayloadAvailable()) {
182 payload.addAll(relationTermSpan.getPayload());
183 }
184 payload.add(createClassPayload(cs.getLeftStart(), cs.getLeftEnd(),
185 (byte) 1));
186 payload.add(createClassPayload(cs.getRightStart(), cs.getRightEnd(),
187 (byte) 2));
188 cs.setPayloads(payload);
189 }
190
191 private byte[] createClassPayload(int start, int end, byte classNumber) {
192 ByteBuffer buffer = ByteBuffer.allocate(9);
193 buffer.putInt(start);
194 buffer.putInt(end);
195 buffer.put(classNumber);
196 return buffer.array();
197 }
198
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000199 @Override
200 public boolean skipTo(int target) throws IOException {
201 if (hasMoreSpans && (firstSpans.doc() < target)) {
202 if (!firstSpans.skipTo(target)) {
203 candidateList.clear();
204 return false;
205 }
206 }
207 setCandidateList();
208 matchPayload.clear();
209 isStartEnumeration = false;
210 return advance();
211 }
Eliza Margarethaf13b8ad2014-10-13 16:36:28 +0000212
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000213 @Override
214 public long cost() {
215 return firstSpans.cost();
216 }
Eliza Margarethaf13b8ad2014-10-13 16:36:28 +0000217
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000218 /**
219 * Returns the right start position of the current RelationSpan.
220 *
221 * @return the right start position of the current RelationSpan.
222 */
223 public int getRightStart() {
224 return rightStart;
225 }
Eliza Margarethad12cabb2014-10-27 17:45:34 +0000226
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000227 /**
228 * Sets the right start position of the current RelationSpan.
229 *
230 * @param rightStart the right start position of the current RelationSpan
231 */
232 public void setRightStart(int rightStart) {
233 this.rightStart = rightStart;
234 }
Eliza Margarethad12cabb2014-10-27 17:45:34 +0000235
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000236 /**
237 * Returns the right end position of the current RelationSpan.
238 *
239 * @return the right end position of the current RelationSpan.
240 */
241 public int getRightEnd() {
242 return rightEnd;
243 }
Eliza Margarethad12cabb2014-10-27 17:45:34 +0000244
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000245 /**
246 * Sets the right end position of the current RelationSpan.
247 *
248 * @param rightEnd the right end position of the current RelationSpan.
249 */
250 public void setRightEnd(int rightEnd) {
251 this.rightEnd = rightEnd;
252 }
Eliza Margarethad12cabb2014-10-27 17:45:34 +0000253
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000254 /**
Eliza Margaretha2db5e232015-03-04 10:20:01 +0000255 * CandidateRelationSpan stores a state of RelationSpans. In a list,
256 * CandidateRelationSpans are ordered first by the position of the relation
257 * left side.
258 */
259 class CandidateRelationSpan extends CandidateSpan {
Eliza Margarethad12cabb2014-10-27 17:45:34 +0000260
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000261 private int rightStart, rightEnd;
262 private short leftId, rightId;
263
264 public CandidateRelationSpan(Spans span) throws IOException {
265 super(span);
266 }
267
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000268 public int getRightEnd() {
269 return rightEnd;
270 }
271
272 public void setRightEnd(int rightEnd) {
273 this.rightEnd = rightEnd;
274 }
275
276 public int getRightStart() {
277 return rightStart;
278 }
279
280 public void setRightStart(int rightStart) {
281 this.rightStart = rightStart;
282 }
283
284 public short getLeftId() {
285 return leftId;
286 }
287
288 public void setLeftId(short leftId) {
289 this.leftId = leftId;
290 }
291
292 public short getRightId() {
293 return rightId;
294 }
295
296 public void setRightId(short rightId) {
297 this.rightId = rightId;
298 }
299
300 }
301
Eliza Margarethaf13b8ad2014-10-13 16:36:28 +0000302}