blob: a4d71594d4af4ef2af7918030863e76df05b8dc3 [file] [log] [blame]
Nils Diewaldf399a672013-11-18 17:55:22 +00001package de.ids_mannheim.korap.query.spans;
2
Eliza Margarethac7fb7312014-07-25 14:11:36 +00003import java.io.IOException;
Nils Diewaldf399a672013-11-18 17:55:22 +00004import java.nio.ByteBuffer;
Eliza Margarethac7fb7312014-07-25 14:11:36 +00005import java.util.ArrayList;
Eliza Margaretha23f98762014-10-30 17:34:47 +00006import java.util.Arrays;
Eliza Margarethac7fb7312014-07-25 14:11:36 +00007import java.util.Collections;
8import java.util.List;
9import java.util.Map;
Nils Diewaldf399a672013-11-18 17:55:22 +000010
Eliza Margarethac7fb7312014-07-25 14:11:36 +000011import org.apache.lucene.index.AtomicReaderContext;
12import org.apache.lucene.index.Term;
13import org.apache.lucene.index.TermContext;
14import org.apache.lucene.search.spans.Spans;
15import org.apache.lucene.search.spans.TermSpans;
16import org.apache.lucene.util.Bits;
Nils Diewaldf399a672013-11-18 17:55:22 +000017
Eliza Margarethac7fb7312014-07-25 14:11:36 +000018import de.ids_mannheim.korap.query.SpanElementQuery;
Nils Diewaldf399a672013-11-18 17:55:22 +000019
/**
 * Enumeration of spans which are elements such as phrases, sentences and
 * paragraphs.
 *
 * @author margaretha
 * @author diewald
 */
Eliza Margarethaf4611272014-10-16 08:45:33 +000027public class ElementSpans extends SpansWithId {
Nils Diewald82a4b862014-02-20 21:17:41 +000028
Eliza Margaretha19cecc62014-12-19 17:10:06 +000029 private List<CandidateElementSpan> candidateList;
30 private int currentDoc, currentPosition;
31 private TermSpans termSpans;
Nils Diewald1455e1e2014-08-01 16:12:43 +000032
Eliza Margaretha19cecc62014-12-19 17:10:06 +000033 /**
34 * Constructs ElementSpans for the given {@link SpanElementQuery}.
35 *
36 * @param spanElementQuery a SpanElementQuery
37 * @param context
38 * @param acceptDocs
39 * @param termContexts
40 * @throws IOException
41 */
42 public ElementSpans(SpanElementQuery spanElementQuery,
43 AtomicReaderContext context, Bits acceptDocs,
44 Map<Term, TermContext> termContexts) throws IOException {
45 super(spanElementQuery, context, acceptDocs, termContexts);
46 candidateList = new ArrayList<>();
47 termSpans = (TermSpans) firstSpans;
48 hasMoreSpans = termSpans.next();
49 if (hasMoreSpans) {
50 currentDoc = termSpans.doc();
51 currentPosition = termSpans.start();
52 }
53 }
Nils Diewaldf399a672013-11-18 17:55:22 +000054
Eliza Margaretha19cecc62014-12-19 17:10:06 +000055 @Override
56 public boolean next() throws IOException {
57 isStartEnumeration = false;
58 return advance();
59 }
Eliza Margarethac7fb7312014-07-25 14:11:36 +000060
Eliza Margaretha19cecc62014-12-19 17:10:06 +000061 /**
62 * Advances the ElementSpans to the next match by first checking the
63 * candidate match list. If the list is empty, it will be set/filled in
64 * first. Tells if there is a next match or not.
65 *
66 * @return <code>true</code> if a match is found, <code>false</code>
67 * otherwise.
68 * @throws IOException
69 */
70 private boolean advance() throws IOException {
71 while (hasMoreSpans || !candidateList.isEmpty()) {
72 if (!candidateList.isEmpty()) {
73 CandidateElementSpan cs = candidateList.get(0);
74 this.matchDocNumber = cs.getDoc();
75 this.matchStartPosition = cs.getStart();
76 this.matchEndPosition = cs.getEnd();
77 this.matchPayload = cs.getPayloads();
78 // this.setElementRef(cs.getSpanId());
79 this.setSpanId(cs.getSpanId());
80 candidateList.remove(0);
81 return true;
82 } else {
83 // logger.info("Setting candidate list");
84 setCandidateList();
85 currentDoc = termSpans.doc();
86 currentPosition = termSpans.start();
87 }
88 }
89 return false;
90 }
Nils Diewaldf399a672013-11-18 17:55:22 +000091
Eliza Margaretha19cecc62014-12-19 17:10:06 +000092 /**
93 * Collects all the elements starting at the same position and sort them by
94 * their end positions. The list starts with the element having the smallest
95 * end position.
96 *
97 * @throws IOException
98 */
99 private void setCandidateList() throws IOException {
100 while (hasMoreSpans && termSpans.doc() == currentDoc
101 && termSpans.start() == currentPosition) {
102 CandidateElementSpan cs = new CandidateElementSpan(termSpans,
103 spanId);
104 // elementRef);
105 readPayload(cs);
106 candidateList.add(cs);
107 hasMoreSpans = termSpans.next();
108 }
109 Collections.sort(candidateList);
110 }
111
112 /**
113 * Reads the payloads of the termSpan and sets the end position and element
114 * id from the payloads for the candidate match. The payloads for
115 * character-offsets are set as the candidate match payloads. <br/>
116 * <br/>
117 * <em>Note</em>: payloadbuffer should actually collects all other payload
118 * beside end position and element id, but KorapIndex identify element's
119 * payloads by its length (8), which represents the character offset
120 * payloads. So these offsets are directly set as the candidate match
121 * payload.
122 *
123 * @param cs a candidate match
124 * @throws IOException
125 */
126 private void readPayload(CandidateElementSpan cs) throws IOException {
127 List<byte[]> payload = (List<byte[]>) termSpans.getPayload();
128 int length = payload.get(0).length;
129 ByteBuffer bb = ByteBuffer.allocate(length);
130 bb.put(payload.get(0));
131
132 if (!payload.isEmpty()) {
133 // set element end position from payload
134 cs.setEnd(bb.getInt(8));
135
136 if (hasSpanId) { // copy element id
137 cs.setSpanId(bb.getShort(12));
138 } else { // set element id -1
139 cs.setSpanId((short) -1);
140 }
141 // Copy the start and end character offsets
142 byte[] b = new byte[8];
143 b = Arrays.copyOfRange(bb.array(), 0, 8);
144 cs.setPayloads(Collections.singletonList(b));
145 } else {
146 cs.setEnd(cs.getStart());
147 cs.setSpanId((short) -1);
148 cs.setPayloads(null);
149 }
150 }
151
152 @Override
153 public boolean skipTo(int target) throws IOException {
154 if (hasMoreSpans && (firstSpans.doc() < target)) {
155 if (!firstSpans.skipTo(target)) {
156 candidateList.clear();
157 return false;
158 }
159 }
160 setCandidateList();
161 matchPayload.clear();
162 isStartEnumeration = false;
163 return advance();
164 }
165
166 @Override
167 public long cost() {
168 return termSpans.cost();
169 }
170
171 /**
172 * Match candidate for element spans.
173 *
174 * @author margaretha
175 *
176 */
177 class CandidateElementSpan extends CandidateSpan {
178
179 private short elementId;
180
181 public CandidateElementSpan(Spans span, short elementId)
182 throws IOException {
183 super(span);
184 setSpanId(elementId);
185 }
186
187 public void setSpanId(short elementId) {
188 this.elementId = elementId;
189 }
190
191 public short getSpanId() {
192 return elementId;
193 }
194 }
Nils Diewaldf399a672013-11-18 17:55:22 +0000195};