blob: cb2bbd763497aaa2616d79f706fc1d04178b0845 [file] [log] [blame]
Eliza Margaretha7ee76da2014-08-12 15:32:33 +00001package de.ids_mannheim.korap.query.spans;
2
3import java.io.IOException;
Eliza Margaretha656cb312014-08-14 12:42:26 +00004import java.nio.ByteBuffer;
Eliza Margaretha7ee76da2014-08-12 15:32:33 +00005import java.util.ArrayList;
margaretha21e4ca22018-11-28 14:25:46 +01006import java.util.Collections;
7import java.util.List;
Eliza Margaretha7ee76da2014-08-12 15:32:33 +00008import java.util.Map;
margarethaf151c962018-11-27 17:38:59 +01009import java.util.TreeSet;
Eliza Margaretha7ee76da2014-08-12 15:32:33 +000010
Akron700c1eb2015-09-25 16:57:30 +020011import org.apache.lucene.index.LeafReaderContext;
Eliza Margaretha7ee76da2014-08-12 15:32:33 +000012import org.apache.lucene.index.Term;
13import org.apache.lucene.index.TermContext;
14import org.apache.lucene.util.Bits;
15
margaretha21e4ca22018-11-28 14:25:46 +010016import org.slf4j.Logger;
17import org.slf4j.LoggerFactory;
18
Eliza Margaretha7ee76da2014-08-12 15:32:33 +000019import de.ids_mannheim.korap.query.SpanExpansionQuery;
20
Eliza Margarethadffd0592015-01-15 18:24:39 +000021/**
Nils Diewaldbb33da22015-03-04 16:24:25 +000022 * Enumeration of spans expanded with minimum <code>m</code> and
margarethaf151c962018-11-27 17:38:59 +010023 * maximum <code>n</code> token positions to either left (negative)
24 * or right direction from the original spans. See examples in
25 * {@link SpanExpansionQuery}.
Eliza Margarethadffd0592015-01-15 18:24:39 +000026 *
Nils Diewaldbb33da22015-03-04 16:24:25 +000027 * The expansion offsets, namely the start and end position of an
margaretha21e4ca22018-11-28 14:25:46 +010028 * expansion part, can be stored in payloads. A class number is
margarethaf151c962018-11-27 17:38:59 +010029 * assigned to the offsets grouping them altogether.
Eliza Margarethaafe98122015-01-23 17:37:57 +000030 *
Eliza Margaretha656cb312014-08-14 12:42:26 +000031 * @author margaretha
Eliza Margaretha6f989202016-10-14 21:48:29 +020032 */
Eliza Margarethadffd0592015-01-15 18:24:39 +000033public class ExpandedSpans extends SimpleSpans {
Eliza Margaretha7ee76da2014-08-12 15:32:33 +000034
Eliza Margarethadffd0592015-01-15 18:24:39 +000035 private int min, max;
36 private byte classNumber;
37 private int direction;
margaretha21e4ca22018-11-28 14:25:46 +010038 private List<CandidateSpan> candidateSpans;
Eliza Margarethadffd0592015-01-15 18:24:39 +000039 private long matchCost;
Eliza Margaretha7ee76da2014-08-12 15:32:33 +000040
margaretha21e4ca22018-11-28 14:25:46 +010041 // Logger
42 private final Logger log = LoggerFactory.getLogger(ExpandedSpans.class);
43
44 // This advices the java compiler to ignore all loggings
45 public static final boolean DEBUG = false;
46
Nils Diewaldbb33da22015-03-04 16:24:25 +000047
Eliza Margarethadffd0592015-01-15 18:24:39 +000048 /**
Nils Diewaldbb33da22015-03-04 16:24:25 +000049 * Constructs ExpandedSpans from the given
50 * {@link SpanExpansionQuery}.
Eliza Margarethadffd0592015-01-15 18:24:39 +000051 *
Nils Diewaldbb33da22015-03-04 16:24:25 +000052 * @param spanExpansionQuery
53 * a SpanExpansionQuery
Eliza Margarethadffd0592015-01-15 18:24:39 +000054 * @param context
55 * @param acceptDocs
56 * @param termContexts
57 * @throws IOException
58 */
Nils Diewaldbb33da22015-03-04 16:24:25 +000059 public ExpandedSpans (SpanExpansionQuery spanExpansionQuery,
Akron700c1eb2015-09-25 16:57:30 +020060 LeafReaderContext context, Bits acceptDocs,
Nils Diewaldbb33da22015-03-04 16:24:25 +000061 Map<Term, TermContext> termContexts)
62 throws IOException {
Eliza Margarethadffd0592015-01-15 18:24:39 +000063 super(spanExpansionQuery, context, acceptDocs, termContexts);
64 this.min = spanExpansionQuery.getMin();
65 this.max = spanExpansionQuery.getMax();
66 this.direction = spanExpansionQuery.getDirection();
67 this.classNumber = spanExpansionQuery.getClassNumber();
Eliza Margaretha7ee76da2014-08-12 15:32:33 +000068
margaretha21e4ca22018-11-28 14:25:46 +010069 candidateSpans = new ArrayList<CandidateSpan>();
70 hasMoreSpans = firstSpans.next();
Eliza Margarethadffd0592015-01-15 18:24:39 +000071 }
Eliza Margaretha7ee76da2014-08-12 15:32:33 +000072
Eliza Margarethadffd0592015-01-15 18:24:39 +000073 @Override
Nils Diewaldbb33da22015-03-04 16:24:25 +000074 public boolean next () throws IOException {
Eliza Margarethadffd0592015-01-15 18:24:39 +000075 matchPayload.clear();
76 isStartEnumeration = false;
Eliza Margarethadffd0592015-01-15 18:24:39 +000077 return advance();
78 }
Eliza Margaretha7ee76da2014-08-12 15:32:33 +000079
Eliza Margarethadffd0592015-01-15 18:24:39 +000080 /**
Nils Diewaldbb33da22015-03-04 16:24:25 +000081 * Advances the ExpandedSpans to the next match by setting the
82 * first element
83 * in the candidateList as the match. Set the candidateList, if it
84 * is empty
Eliza Margarethadffd0592015-01-15 18:24:39 +000085 *
Nils Diewaldbb33da22015-03-04 16:24:25 +000086 * @return <code>true</code> if a match is found,
87 * <code>false</code>
Eliza Margarethadffd0592015-01-15 18:24:39 +000088 * otherwise.
89 * @throws IOException
90 */
Nils Diewaldbb33da22015-03-04 16:24:25 +000091 private boolean advance () throws IOException {
Eliza Margarethadffd0592015-01-15 18:24:39 +000092 while (candidateSpans.size() > 0 || hasMoreSpans) {
93 if (candidateSpans.size() > 0) {
margaretha21e4ca22018-11-28 14:25:46 +010094 CandidateSpan cs = candidateSpans.get(0);
margarethaf151c962018-11-27 17:38:59 +010095 setMatch(cs);
96 candidateSpans.remove(cs);
Eliza Margarethadffd0592015-01-15 18:24:39 +000097 return true;
Nils Diewaldbb33da22015-03-04 16:24:25 +000098 }
99 else {
margaretha52a0d112018-11-28 12:58:55 +0100100 setCandidateList();
margaretha21e4ca22018-11-28 14:25:46 +0100101 Collections.sort(candidateSpans);
102 if (DEBUG) {
103 log.debug(candidateSpans.toString());
104 };
Eliza Margarethadffd0592015-01-15 18:24:39 +0000105 }
106 }
107 return false;
108 }
109
110 /**
Nils Diewaldbb33da22015-03-04 16:24:25 +0000111 * Sets the candidateList by adding new candidate match spans for
margarethaf151c962018-11-27 17:38:59 +0100112 * all possible expansion with respect to the expansion length
113 * (min,max) variables.
114 *
Eliza Margarethadffd0592015-01-15 18:24:39 +0000115 * @throws IOException
116 */
margaretha52a0d112018-11-28 12:58:55 +0100117 private void setCandidateList () throws IOException {
Eliza Margarethadffd0592015-01-15 18:24:39 +0000118 CandidateSpan cs;
margaretha21e4ca22018-11-28 14:25:46 +0100119 int counter, start, end = 0;
Eliza Margarethadffd0592015-01-15 18:24:39 +0000120
margarethaf151c962018-11-27 17:38:59 +0100121 if (direction < 0) { // left
Eliza Margarethadffd0592015-01-15 18:24:39 +0000122 counter = max;
123 while (counter >= min) {
margaretha21e4ca22018-11-28 14:25:46 +0100124 start = firstSpans.start() - counter;
125 if (start >= 0) {
126 cs = new CandidateSpan(start, firstSpans.end(),
127 firstSpans.doc(), firstSpans.cost(),
128 createPayloads(start, firstSpans.start()));
Eliza Margarethadffd0592015-01-15 18:24:39 +0000129
margaretha21e4ca22018-11-28 14:25:46 +0100130 candidateSpans.add(cs);
131 }
Eliza Margarethadffd0592015-01-15 18:24:39 +0000132 counter--;
133 }
margaretha21e4ca22018-11-28 14:25:46 +0100134
margaretha52a0d112018-11-28 12:58:55 +0100135 int lastPosition = firstSpans.start();
margaretha21e4ca22018-11-28 14:25:46 +0100136 if (hasMoreSpans && (hasMoreSpans = firstSpans.next())) {
margarethaf151c962018-11-27 17:38:59 +0100137 start = Math.max(0, firstSpans.start() - max);
margaretha21e4ca22018-11-28 14:25:46 +0100138 if (DEBUG) {
139 log.debug("next candidate start: " + start + ", lastPosition "
140 + lastPosition);
141 };
margarethaf151c962018-11-27 17:38:59 +0100142 if (start <= lastPosition) {
margaretha52a0d112018-11-28 12:58:55 +0100143 setCandidateList();
margarethaf151c962018-11-27 17:38:59 +0100144 }
145 }
Nils Diewaldbb33da22015-03-04 16:24:25 +0000146 }
147 else {
Eliza Margarethadffd0592015-01-15 18:24:39 +0000148 counter = min;
149 while (counter <= max) {
margaretha21e4ca22018-11-28 14:25:46 +0100150 // TODO: How do I know if the end is already too far
151 // (over the end of the doc)?
Eliza Margarethadffd0592015-01-15 18:24:39 +0000152 end = firstSpans.end() + counter;
153 cs = new CandidateSpan(firstSpans.start(), end,
Eliza Margaretha6f989202016-10-14 21:48:29 +0200154 firstSpans.doc(), firstSpans.cost(),
155 createPayloads(firstSpans.end(), end));
Eliza Margarethadffd0592015-01-15 18:24:39 +0000156 candidateSpans.add(cs);
157 counter++;
158 }
margaretha21e4ca22018-11-28 14:25:46 +0100159
160 int lastPosition = end;
161 if (hasMoreSpans && (hasMoreSpans = firstSpans.next())) {
162 if (DEBUG) {
163 log.debug("next candidate start: " + firstSpans.start()
164 + ", lastPosition " + lastPosition);
165 };
166 if (firstSpans.start() <= lastPosition) {
167 setCandidateList();
168 }
169 }
Eliza Margarethadffd0592015-01-15 18:24:39 +0000170 }
171 }
172
173 /**
Nils Diewaldbb33da22015-03-04 16:24:25 +0000174 * Prepares the payloads for a candidate match (ExpandedSpans). If
margarethaf151c962018-11-27 17:38:59 +0100175 * the class number is set, the extension offsets with the given
176 * start and end positions are to be stored in the payloads.
Eliza Margarethadffd0592015-01-15 18:24:39 +0000177 *
margaretha21e4ca22018-11-28 14:25:46 +0100178 * @param start
179 * start position
180 * @param end
181 * end position
Eliza Margarethadffd0592015-01-15 18:24:39 +0000182 * @return the payloads for a candidaete match
183 * @throws IOException
184 */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000185 private ArrayList<byte[]> createPayloads (int start, int end)
Eliza Margarethadffd0592015-01-15 18:24:39 +0000186 throws IOException {
187
188 ArrayList<byte[]> payload = new ArrayList<byte[]>();
189 if (firstSpans.isPayloadAvailable()) {
190 payload.addAll(firstSpans.getPayload());
191 }
192 if (classNumber > 0) {
margaretha21e4ca22018-11-28 14:25:46 +0100193 // System.out.println("Extension offsets "+start+","+end);
Eliza Margarethadffd0592015-01-15 18:24:39 +0000194 payload.add(createExtensionPayloads(start, end));
195 }
196 return payload;
197 }
198
199 /**
Nils Diewaldbb33da22015-03-04 16:24:25 +0000200 * Prepares a byte array of extension offsets with the given start
margarethaf151c962018-11-27 17:38:59 +0100201 * and end positions and the class number, to be stored in
202 * payloads.
Eliza Margarethadffd0592015-01-15 18:24:39 +0000203 *
margaretha21e4ca22018-11-28 14:25:46 +0100204 * @param start
205 * start position
206 * @param end
207 * end position
Eliza Margarethadffd0592015-01-15 18:24:39 +0000208 * @return a byte array of extension offsets and the class number
209 */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000210 private byte[] createExtensionPayloads (int start, int end) {
Akron42993552016-02-04 13:24:24 +0100211 ByteBuffer buffer = ByteBuffer.allocate(10);
212 Byte classPTI = 0;
213 buffer.put(classPTI);
Eliza Margarethadffd0592015-01-15 18:24:39 +0000214 buffer.putInt(start);
215 buffer.putInt(end);
216 buffer.put(classNumber);
217 return buffer.array();
218 }
219
220 /**
Nils Diewaldbb33da22015-03-04 16:24:25 +0000221 * Sets the properties of the given candidate match span as the
margarethaf151c962018-11-27 17:38:59 +0100222 * current match (state of ExpandedSpans).
Eliza Margarethadffd0592015-01-15 18:24:39 +0000223 *
224 * @param candidateSpan
225 */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000226 private void setMatch (CandidateSpan candidateSpan) {
Eliza Margarethadffd0592015-01-15 18:24:39 +0000227 matchDocNumber = candidateSpan.getDoc();
228 matchStartPosition = candidateSpan.getStart();
229 matchEndPosition = candidateSpan.getEnd();
230 matchPayload = candidateSpan.getPayloads();
231 matchCost = candidateSpan.getCost();
232 }
233
234 @Override
Nils Diewaldbb33da22015-03-04 16:24:25 +0000235 public boolean skipTo (int target) throws IOException {
Eliza Margarethadffd0592015-01-15 18:24:39 +0000236 if (hasMoreSpans && (firstSpans.doc() < target)) {
237 if (!firstSpans.skipTo(target)) {
238 hasMoreSpans = false;
239 return false;
240 }
241 }
242 matchPayload.clear();
243 return advance();
244 }
245
246 @Override
Nils Diewaldbb33da22015-03-04 16:24:25 +0000247 public long cost () {
Eliza Margarethadffd0592015-01-15 18:24:39 +0000248 return matchCost;
249 }
Eliza Margaretha7ee76da2014-08-12 15:32:33 +0000250
251}