// blob: 02d96560c34a844f545db300bcf11cc9b646ecd8
package de.ids_mannheim.korap.query.spans;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.search.spans.TermSpans;
import org.apache.lucene.util.Bits;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import de.ids_mannheim.korap.query.SpanRelationQuery;

Eliza Margaretha493bfa92015-01-13 16:16:38 +000020/**
Nils Diewaldbb33da22015-03-04 16:24:25 +000021 * Enumeration of spans denoting relations between two
margarethaca8d6222015-04-15 13:46:41 +020022 * tokens/elements. The start and end of a RelationSpan always denote
23 * the start and end of the left-side token/element.
Eliza Margarethaf13b8ad2014-10-13 16:36:28 +000024 *
Nils Diewaldbb33da22015-03-04 16:24:25 +000025 * There are 4 types of relations, which is differentiated by the
margarethaca8d6222015-04-15 13:46:41 +020026 * payload length in bytes.
Eliza Margaretha493bfa92015-01-13 16:16:38 +000027 * <ol>
28 * <li>Token to token relation (1 int & 3 short, length: 10)</li>
29 * <li>Token to span (2 int & 3 short, length: 14)</li>
30 * <li>Span to token (int, byte, int, 3 short, length: 15)</li>
31 * <li>Span to Span (3 int & 3 short, length: 18)</li>
32 * </ol>
Nils Diewaldbb33da22015-03-04 16:24:25 +000033 * Every integer value denotes the start/end position of the
margarethaca8d6222015-04-15 13:46:41 +020034 * start/target of a relation, in this format: (sourceEndPos?,
35 * startTargetPos, endTargetPos?). The end position of a token is
36 * identical to its start position, and therefore not is saved in a
37 * payload.
Eliza Margarethaf13b8ad2014-10-13 16:36:28 +000038 *
Nils Diewaldbb33da22015-03-04 16:24:25 +000039 * The short values denote the relation id, left id, and right id. The
margarethaca8d6222015-04-15 13:46:41 +020040 * byte in relation #3 is just a dummy to create a different length
41 * from the relation #2.
Eliza Margarethaf13b8ad2014-10-13 16:36:28 +000042 *
Nils Diewaldbb33da22015-03-04 16:24:25 +000043 * NOTE: Sorting of the candidate spans can alternatively be done in
margarethaca8d6222015-04-15 13:46:41 +020044 * indexing, instead of here. (first by left positions and then by
45 * right positions)
46 *
47 * The class number of relation source is always 1 and that of
48 * relation target is always 2 regardless of the relation direction.
Eliza Margaretha493bfa92015-01-13 16:16:38 +000049 *
50 * @author margaretha
Eliza Margarethaf13b8ad2014-10-13 16:36:28 +000051 * */
Eliza Margaretha493bfa92015-01-13 16:16:38 +000052public class RelationSpans extends RelationBaseSpans {
Eliza Margarethaf13b8ad2014-10-13 16:36:28 +000053
Eliza Margaretha493bfa92015-01-13 16:16:38 +000054 private int currentDoc, currentPosition;
margarethaca8d6222015-04-15 13:46:41 +020055 private int direction;
Eliza Margaretha493bfa92015-01-13 16:16:38 +000056 private TermSpans relationTermSpan;
Eliza Margarethaf13b8ad2014-10-13 16:36:28 +000057
Eliza Margaretha493bfa92015-01-13 16:16:38 +000058 protected Logger logger = LoggerFactory.getLogger(RelationSpans.class);
margaretha7ee65952015-12-14 15:39:12 +010059 private List<CandidateSpan> candidateList;
margarethaf70addb2015-04-27 13:17:18 +020060 private byte tempSourceNum, tempTargetNum;
61 private byte sourceClass, targetClass;
Nils Diewaldbb33da22015-03-04 16:24:25 +000062
margaretha7ee65952015-12-14 15:39:12 +010063 public static enum PayloadTypeIdentifier {
64 TERM_TO_TERM(32), TERM_TO_ELEMENT(33), ELEMENT_TO_TERM(34), ELEMENT_TO_ELEMENT(
65 35);
66
67 private byte value;
68
69 private PayloadTypeIdentifier (int value) {
70 this.value = (byte) value;
71 }
72 }
Akronbb5d1732015-06-22 01:22:40 +020073
Eliza Margaretha493bfa92015-01-13 16:16:38 +000074 /**
Nils Diewaldbb33da22015-03-04 16:24:25 +000075 * Constructs RelationSpans from the given
76 * {@link SpanRelationQuery}.
Eliza Margaretha493bfa92015-01-13 16:16:38 +000077 *
Nils Diewaldbb33da22015-03-04 16:24:25 +000078 * @param relationSpanQuery
79 * a SpanRelationQuery
Eliza Margaretha493bfa92015-01-13 16:16:38 +000080 * @param context
81 * @param acceptDocs
82 * @param termContexts
83 * @throws IOException
84 */
Nils Diewaldbb33da22015-03-04 16:24:25 +000085 public RelationSpans (SpanRelationQuery relationSpanQuery,
Akron700c1eb2015-09-25 16:57:30 +020086 LeafReaderContext context, Bits acceptDocs,
Nils Diewaldbb33da22015-03-04 16:24:25 +000087 Map<Term, TermContext> termContexts)
88 throws IOException {
Eliza Margaretha493bfa92015-01-13 16:16:38 +000089 super(relationSpanQuery, context, acceptDocs, termContexts);
margarethaca8d6222015-04-15 13:46:41 +020090 direction = relationSpanQuery.getDirection();
margarethaf70addb2015-04-27 13:17:18 +020091 tempSourceNum = relationSpanQuery.getTempSourceNum();
92 tempTargetNum = relationSpanQuery.getTempTargetNum();
93 sourceClass = relationSpanQuery.getSourceClass();
94 targetClass = relationSpanQuery.getTargetClass();
95
Eliza Margaretha493bfa92015-01-13 16:16:38 +000096 candidateList = new ArrayList<>();
97 relationTermSpan = (TermSpans) firstSpans;
98 hasMoreSpans = relationTermSpan.next();
99 }
Eliza Margarethaf13b8ad2014-10-13 16:36:28 +0000100
Nils Diewaldbb33da22015-03-04 16:24:25 +0000101
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000102 @Override
Nils Diewaldbb33da22015-03-04 16:24:25 +0000103 public boolean next () throws IOException {
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000104 isStartEnumeration = false;
105 return advance();
106 }
Eliza Margarethaf13b8ad2014-10-13 16:36:28 +0000107
Nils Diewaldbb33da22015-03-04 16:24:25 +0000108
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000109 /**
Nils Diewaldbb33da22015-03-04 16:24:25 +0000110 * Returns true if there is a next match by checking if the
111 * CandidateList is
112 * not empty and set the first element of the list as the next
113 * match.
114 * Otherwise, if the RelationSpan has not ended yet, try to set
115 * the
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000116 * CandidateList.
117 *
118 * @return true if there is a next match.
119 * @throws IOException
120 */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000121 private boolean advance () throws IOException {
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000122 while (hasMoreSpans || !candidateList.isEmpty()) {
123 if (!candidateList.isEmpty()) {
margaretha7ee65952015-12-14 15:39:12 +0100124 CandidateSpan cs = candidateList.get(0);
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000125 this.matchDocNumber = cs.getDoc();
126 this.matchStartPosition = cs.getStart();
127 this.matchEndPosition = cs.getEnd();
Nils Diewaldbb33da22015-03-04 16:24:25 +0000128 this.matchPayload = cs.getPayloads();
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000129 this.setRightStart(cs.getRightStart());
130 this.setRightEnd(cs.getRightEnd());
131 this.spanId = cs.getSpanId(); // relation id
132 this.leftId = cs.getLeftId();
133 this.rightId = cs.getRightId();
134 candidateList.remove(0);
135 return true;
Nils Diewaldbb33da22015-03-04 16:24:25 +0000136 }
Eliza Margaretha2db5e232015-03-04 10:20:01 +0000137 else {
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000138 setCandidateList();
139 currentDoc = relationTermSpan.doc();
140 currentPosition = relationTermSpan.start();
141 }
142 }
143 return false;
144 }
Eliza Margaretha3e50bc42014-10-22 15:29:15 +0000145
Nils Diewaldbb33da22015-03-04 16:24:25 +0000146
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000147 /**
Nils Diewaldbb33da22015-03-04 16:24:25 +0000148 * Setting the CandidateList by adding all relationTermSpan whose
149 * start
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000150 * position is the same as the current span position, and sort the
151 * candidateList.
152 *
153 * @throws IOException
154 */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000155 private void setCandidateList () throws IOException {
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000156 while (hasMoreSpans && relationTermSpan.doc() == currentDoc
157 && relationTermSpan.start() == currentPosition) {
margaretha7ee65952015-12-14 15:39:12 +0100158
159 CandidateSpan cs = new CandidateSpan(
Nils Diewaldbb33da22015-03-04 16:24:25 +0000160 relationTermSpan);
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000161 readPayload(cs);
Nils Diewaldbb33da22015-03-04 16:24:25 +0000162 setPayload(cs);
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000163 candidateList.add(cs);
164 hasMoreSpans = relationTermSpan.next();
165 }
166 Collections.sort(candidateList);
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000167 }
Eliza Margarethaf13b8ad2014-10-13 16:36:28 +0000168
Nils Diewaldbb33da22015-03-04 16:24:25 +0000169
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000170 /**
Nils Diewaldbb33da22015-03-04 16:24:25 +0000171 * Identify the relation type of the given
172 * {@link CandidateRelationSpan} by
173 * checking the length of its payloads, and set some properties of
174 * the span
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000175 * based on the payloads.
176 *
Nils Diewaldbb33da22015-03-04 16:24:25 +0000177 * @param cs
178 * a CandidateRelationSpan
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000179 */
margaretha7ee65952015-12-14 15:39:12 +0100180 private void readPayload(CandidateSpan cs) {
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000181 List<byte[]> payload = (List<byte[]>) cs.getPayloads();
182 int length = payload.get(0).length;
183 ByteBuffer bb = ByteBuffer.allocate(length);
184 bb.put(payload.get(0));
Eliza Margarethad12cabb2014-10-27 17:45:34 +0000185
margaretha50c76332015-03-19 10:10:39 +0100186 cs.setLeftStart(cs.start);
187
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000188 int i;
margaretha7ee65952015-12-14 15:39:12 +0100189 this.payloadTypeIdentifier = bb.get(0);
190
191 if (payloadTypeIdentifier == PayloadTypeIdentifier.TERM_TO_TERM.value){ // length 11
192 i = bb.getInt(1);
193 cs.setLeftEnd(cs.start + 1);
194 cs.setRightStart(i);
195 cs.setRightEnd(i + 1);
196 }
197 else if (payloadTypeIdentifier == PayloadTypeIdentifier.TERM_TO_ELEMENT.value) { // length
198 // 15
199 cs.setLeftEnd(cs.start + 1);
200 cs.setRightStart(bb.getInt(1));
201 cs.setRightEnd(bb.getInt(5));
202 }
203 else if (payloadTypeIdentifier == PayloadTypeIdentifier.ELEMENT_TO_TERM.value) { // length
204 // 15
205 cs.setEnd(bb.getInt(1));
206 cs.setLeftEnd(cs.end);
207 i = bb.getInt(5);
208 cs.setRightStart(i);
209 cs.setRightEnd(i + 1);
210 }
211 else if (payloadTypeIdentifier == PayloadTypeIdentifier.ELEMENT_TO_ELEMENT.value) {
212 cs.setEnd(bb.getInt(1));
213 cs.setLeftEnd(cs.end);
214 cs.setRightStart(bb.getInt(5));
215 cs.setRightEnd(bb.getInt(9));
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000216 }
Eliza Margarethaf13b8ad2014-10-13 16:36:28 +0000217
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000218 cs.setRightId(bb.getShort(length - 2)); //right id
219 cs.setLeftId(bb.getShort(length - 4)); //left id
220 cs.setSpanId(bb.getShort(length - 6)); //relation id
margaretha50c76332015-03-19 10:10:39 +0100221 // Payload is cleared.
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000222 }
Eliza Margarethaf13b8ad2014-10-13 16:36:28 +0000223
Eliza Margaretha2db5e232015-03-04 10:20:01 +0000224
margaretha7ee65952015-12-14 15:39:12 +0100225 private void setPayload(CandidateSpan cs) throws IOException {
Nils Diewaldbb33da22015-03-04 16:24:25 +0000226 ArrayList<byte[]> payload = new ArrayList<byte[]>();
227 if (relationTermSpan.isPayloadAvailable()) {
228 payload.addAll(relationTermSpan.getPayload());
229 }
margarethaca8d6222015-04-15 13:46:41 +0200230 if (direction == 0) {
231 payload.add(createClassPayload(cs.getLeftStart(), cs.getLeftEnd(),
margarethaf70addb2015-04-27 13:17:18 +0200232 tempSourceNum, false));
margarethaca8d6222015-04-15 13:46:41 +0200233 payload.add(createClassPayload(cs.getRightStart(),
margarethaf70addb2015-04-27 13:17:18 +0200234 cs.getRightEnd(), tempTargetNum, false));
235
236 if (sourceClass > 0) {
237 payload.add(createClassPayload(cs.getLeftStart(),
238 cs.getLeftEnd(), sourceClass, true));
239 }
240 if (targetClass > 0) {
241 payload.add(createClassPayload(cs.getRightStart(),
242 cs.getRightEnd(), targetClass, true));
243 }
244
margarethaca8d6222015-04-15 13:46:41 +0200245 }
246 else {
247 payload.add(createClassPayload(cs.getRightStart(),
margarethaf70addb2015-04-27 13:17:18 +0200248 cs.getRightEnd(), tempSourceNum, false));
margarethaca8d6222015-04-15 13:46:41 +0200249 payload.add(createClassPayload(cs.getLeftStart(), cs.getLeftEnd(),
margarethaf70addb2015-04-27 13:17:18 +0200250 tempTargetNum, false));
251
252 if (sourceClass > 0) {
253 payload.add(createClassPayload(cs.getRightStart(),
254 cs.getRightEnd(), sourceClass, true));
255 }
256 if (targetClass > 0) {
257 payload.add(createClassPayload(cs.getLeftStart(),
258 cs.getLeftEnd(), targetClass, true));
259 }
margarethaca8d6222015-04-15 13:46:41 +0200260 }
Nils Diewaldbb33da22015-03-04 16:24:25 +0000261 cs.setPayloads(payload);
262 }
263
Akronbb5d1732015-06-22 01:22:40 +0200264
265 private byte[] createClassPayload (int start, int end, byte classNumber,
margarethaf70addb2015-04-27 13:17:18 +0200266 boolean keep) {
267 ByteBuffer buffer = null;
268 if (keep) {
269 buffer = ByteBuffer.allocate(9);
270 }
271 else {
272 buffer = ByteBuffer.allocate(10);
273 }
Nils Diewaldbb33da22015-03-04 16:24:25 +0000274 buffer.putInt(start);
275 buffer.putInt(end);
276 buffer.put(classNumber);
277 return buffer.array();
278 }
279
Eliza Margaretha2db5e232015-03-04 10:20:01 +0000280
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000281 @Override
Nils Diewaldbb33da22015-03-04 16:24:25 +0000282 public boolean skipTo (int target) throws IOException {
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000283 if (hasMoreSpans && (firstSpans.doc() < target)) {
284 if (!firstSpans.skipTo(target)) {
285 candidateList.clear();
286 return false;
287 }
288 }
289 setCandidateList();
290 matchPayload.clear();
291 isStartEnumeration = false;
292 return advance();
293 }
Eliza Margarethaf13b8ad2014-10-13 16:36:28 +0000294
Nils Diewaldbb33da22015-03-04 16:24:25 +0000295
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000296 @Override
Nils Diewaldbb33da22015-03-04 16:24:25 +0000297 public long cost () {
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000298 return firstSpans.cost();
299 }
Eliza Margarethaf13b8ad2014-10-13 16:36:28 +0000300
Nils Diewaldbb33da22015-03-04 16:24:25 +0000301
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000302 /**
303 * Returns the right start position of the current RelationSpan.
304 *
305 * @return the right start position of the current RelationSpan.
306 */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000307 public int getRightStart () {
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000308 return rightStart;
309 }
Eliza Margarethad12cabb2014-10-27 17:45:34 +0000310
Nils Diewaldbb33da22015-03-04 16:24:25 +0000311
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000312 /**
313 * Sets the right start position of the current RelationSpan.
314 *
Nils Diewaldbb33da22015-03-04 16:24:25 +0000315 * @param rightStart
316 * the right start position of the current RelationSpan
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000317 */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000318 public void setRightStart (int rightStart) {
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000319 this.rightStart = rightStart;
320 }
Eliza Margarethad12cabb2014-10-27 17:45:34 +0000321
Nils Diewaldbb33da22015-03-04 16:24:25 +0000322
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000323 /**
324 * Returns the right end position of the current RelationSpan.
325 *
326 * @return the right end position of the current RelationSpan.
327 */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000328 public int getRightEnd () {
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000329 return rightEnd;
330 }
Eliza Margarethad12cabb2014-10-27 17:45:34 +0000331
Nils Diewaldbb33da22015-03-04 16:24:25 +0000332
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000333 /**
334 * Sets the right end position of the current RelationSpan.
335 *
Nils Diewaldbb33da22015-03-04 16:24:25 +0000336 * @param rightEnd
337 * the right end position of the current RelationSpan.
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000338 */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000339 public void setRightEnd (int rightEnd) {
Eliza Margaretha493bfa92015-01-13 16:16:38 +0000340 this.rightEnd = rightEnd;
341 }
Eliza Margarethad12cabb2014-10-27 17:45:34 +0000342
Eliza Margarethaf13b8ad2014-10-13 16:36:28 +0000343}