| Eliza Margaretha | 7ee76da | 2014-08-12 15:32:33 +0000 | [diff] [blame] | 1 | package de.ids_mannheim.korap.query.spans; |
| 2 | |
| 3 | import java.io.IOException; |
| Eliza Margaretha | 656cb31 | 2014-08-14 12:42:26 +0000 | [diff] [blame] | 4 | import java.nio.ByteBuffer; |
| Eliza Margaretha | 7ee76da | 2014-08-12 15:32:33 +0000 | [diff] [blame] | 5 | import java.util.ArrayList; |
| margaretha | 21e4ca2 | 2018-11-28 14:25:46 +0100 | [diff] [blame^] | 6 | import java.util.Collections; |
| 7 | import java.util.List; |
| Eliza Margaretha | 7ee76da | 2014-08-12 15:32:33 +0000 | [diff] [blame] | 8 | import java.util.Map; |
| margaretha | f151c96 | 2018-11-27 17:38:59 +0100 | [diff] [blame] | 9 | import java.util.TreeSet; |
| Eliza Margaretha | 7ee76da | 2014-08-12 15:32:33 +0000 | [diff] [blame] | 10 | |
| Akron | 700c1eb | 2015-09-25 16:57:30 +0200 | [diff] [blame] | 11 | import org.apache.lucene.index.LeafReaderContext; |
| Eliza Margaretha | 7ee76da | 2014-08-12 15:32:33 +0000 | [diff] [blame] | 12 | import org.apache.lucene.index.Term; |
| 13 | import org.apache.lucene.index.TermContext; |
| 14 | import org.apache.lucene.util.Bits; |
| 15 | |
| margaretha | 21e4ca2 | 2018-11-28 14:25:46 +0100 | [diff] [blame^] | 16 | import org.slf4j.Logger; |
| 17 | import org.slf4j.LoggerFactory; |
| 18 | |
| Eliza Margaretha | 7ee76da | 2014-08-12 15:32:33 +0000 | [diff] [blame] | 19 | import de.ids_mannheim.korap.query.SpanExpansionQuery; |
| 20 | |
| Eliza Margaretha | dffd059 | 2015-01-15 18:24:39 +0000 | [diff] [blame] | 21 | /** |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 22 | * Enumeration of spans expanded with minimum <code>m</code> and |
| margaretha | f151c96 | 2018-11-27 17:38:59 +0100 | [diff] [blame] | 23 | * maximum <code>n</code> token positions to either left (negative) |
| 24 | * or right direction from the original spans. See examples in |
| 25 | * {@link SpanExpansionQuery}. |
| Eliza Margaretha | dffd059 | 2015-01-15 18:24:39 +0000 | [diff] [blame] | 26 | * |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 27 | * The expansion offsets, namely the start and end position of an |
| margaretha | 21e4ca2 | 2018-11-28 14:25:46 +0100 | [diff] [blame^] | 28 | * expansion part, can be stored in payloads. A class number is |
| margaretha | f151c96 | 2018-11-27 17:38:59 +0100 | [diff] [blame] | 29 | * assigned to the offsets grouping them altogether. |
| Eliza Margaretha | afe9812 | 2015-01-23 17:37:57 +0000 | [diff] [blame] | 30 | * |
| Eliza Margaretha | 656cb31 | 2014-08-14 12:42:26 +0000 | [diff] [blame] | 31 | * @author margaretha |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 32 | */ |
| Eliza Margaretha | dffd059 | 2015-01-15 18:24:39 +0000 | [diff] [blame] | 33 | public class ExpandedSpans extends SimpleSpans { |
| Eliza Margaretha | 7ee76da | 2014-08-12 15:32:33 +0000 | [diff] [blame] | 34 | |
| Eliza Margaretha | dffd059 | 2015-01-15 18:24:39 +0000 | [diff] [blame] | 35 | private int min, max; |
| 36 | private byte classNumber; |
| 37 | private int direction; |
| margaretha | 21e4ca2 | 2018-11-28 14:25:46 +0100 | [diff] [blame^] | 38 | private List<CandidateSpan> candidateSpans; |
| Eliza Margaretha | dffd059 | 2015-01-15 18:24:39 +0000 | [diff] [blame] | 39 | private long matchCost; |
| Eliza Margaretha | 7ee76da | 2014-08-12 15:32:33 +0000 | [diff] [blame] | 40 | |
| margaretha | 21e4ca2 | 2018-11-28 14:25:46 +0100 | [diff] [blame^] | 41 | // Logger |
| 42 | private final Logger log = LoggerFactory.getLogger(ExpandedSpans.class); |
| 43 | |
| 44 | // This advices the java compiler to ignore all loggings |
| 45 | public static final boolean DEBUG = false; |
| 46 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 47 | |
| Eliza Margaretha | dffd059 | 2015-01-15 18:24:39 +0000 | [diff] [blame] | 48 | /** |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 49 | * Constructs ExpandedSpans from the given |
| 50 | * {@link SpanExpansionQuery}. |
| Eliza Margaretha | dffd059 | 2015-01-15 18:24:39 +0000 | [diff] [blame] | 51 | * |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 52 | * @param spanExpansionQuery |
| 53 | * a SpanExpansionQuery |
| Eliza Margaretha | dffd059 | 2015-01-15 18:24:39 +0000 | [diff] [blame] | 54 | * @param context |
| 55 | * @param acceptDocs |
| 56 | * @param termContexts |
| 57 | * @throws IOException |
| 58 | */ |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 59 | public ExpandedSpans (SpanExpansionQuery spanExpansionQuery, |
| Akron | 700c1eb | 2015-09-25 16:57:30 +0200 | [diff] [blame] | 60 | LeafReaderContext context, Bits acceptDocs, |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 61 | Map<Term, TermContext> termContexts) |
| 62 | throws IOException { |
| Eliza Margaretha | dffd059 | 2015-01-15 18:24:39 +0000 | [diff] [blame] | 63 | super(spanExpansionQuery, context, acceptDocs, termContexts); |
| 64 | this.min = spanExpansionQuery.getMin(); |
| 65 | this.max = spanExpansionQuery.getMax(); |
| 66 | this.direction = spanExpansionQuery.getDirection(); |
| 67 | this.classNumber = spanExpansionQuery.getClassNumber(); |
| Eliza Margaretha | 7ee76da | 2014-08-12 15:32:33 +0000 | [diff] [blame] | 68 | |
| margaretha | 21e4ca2 | 2018-11-28 14:25:46 +0100 | [diff] [blame^] | 69 | candidateSpans = new ArrayList<CandidateSpan>(); |
| 70 | hasMoreSpans = firstSpans.next(); |
| Eliza Margaretha | dffd059 | 2015-01-15 18:24:39 +0000 | [diff] [blame] | 71 | } |
| Eliza Margaretha | 7ee76da | 2014-08-12 15:32:33 +0000 | [diff] [blame] | 72 | |
| Eliza Margaretha | dffd059 | 2015-01-15 18:24:39 +0000 | [diff] [blame] | 73 | @Override |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 74 | public boolean next () throws IOException { |
| Eliza Margaretha | dffd059 | 2015-01-15 18:24:39 +0000 | [diff] [blame] | 75 | matchPayload.clear(); |
| 76 | isStartEnumeration = false; |
| Eliza Margaretha | dffd059 | 2015-01-15 18:24:39 +0000 | [diff] [blame] | 77 | return advance(); |
| 78 | } |
| Eliza Margaretha | 7ee76da | 2014-08-12 15:32:33 +0000 | [diff] [blame] | 79 | |
| Eliza Margaretha | dffd059 | 2015-01-15 18:24:39 +0000 | [diff] [blame] | 80 | /** |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 81 | * Advances the ExpandedSpans to the next match by setting the |
| 82 | * first element |
| 83 | * in the candidateList as the match. Set the candidateList, if it |
| 84 | * is empty |
| Eliza Margaretha | dffd059 | 2015-01-15 18:24:39 +0000 | [diff] [blame] | 85 | * |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 86 | * @return <code>true</code> if a match is found, |
| 87 | * <code>false</code> |
| Eliza Margaretha | dffd059 | 2015-01-15 18:24:39 +0000 | [diff] [blame] | 88 | * otherwise. |
| 89 | * @throws IOException |
| 90 | */ |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 91 | private boolean advance () throws IOException { |
| Eliza Margaretha | dffd059 | 2015-01-15 18:24:39 +0000 | [diff] [blame] | 92 | while (candidateSpans.size() > 0 || hasMoreSpans) { |
| 93 | if (candidateSpans.size() > 0) { |
| margaretha | 21e4ca2 | 2018-11-28 14:25:46 +0100 | [diff] [blame^] | 94 | CandidateSpan cs = candidateSpans.get(0); |
| margaretha | f151c96 | 2018-11-27 17:38:59 +0100 | [diff] [blame] | 95 | setMatch(cs); |
| 96 | candidateSpans.remove(cs); |
| Eliza Margaretha | dffd059 | 2015-01-15 18:24:39 +0000 | [diff] [blame] | 97 | return true; |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 98 | } |
| 99 | else { |
| margaretha | 52a0d11 | 2018-11-28 12:58:55 +0100 | [diff] [blame] | 100 | setCandidateList(); |
| margaretha | 21e4ca2 | 2018-11-28 14:25:46 +0100 | [diff] [blame^] | 101 | Collections.sort(candidateSpans); |
| 102 | if (DEBUG) { |
| 103 | log.debug(candidateSpans.toString()); |
| 104 | }; |
| Eliza Margaretha | dffd059 | 2015-01-15 18:24:39 +0000 | [diff] [blame] | 105 | } |
| 106 | } |
| 107 | return false; |
| 108 | } |
| 109 | |
| 110 | /** |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 111 | * Sets the candidateList by adding new candidate match spans for |
| margaretha | f151c96 | 2018-11-27 17:38:59 +0100 | [diff] [blame] | 112 | * all possible expansion with respect to the expansion length |
| 113 | * (min,max) variables. |
| 114 | * |
| Eliza Margaretha | dffd059 | 2015-01-15 18:24:39 +0000 | [diff] [blame] | 115 | * @throws IOException |
| 116 | */ |
| margaretha | 52a0d11 | 2018-11-28 12:58:55 +0100 | [diff] [blame] | 117 | private void setCandidateList () throws IOException { |
| Eliza Margaretha | dffd059 | 2015-01-15 18:24:39 +0000 | [diff] [blame] | 118 | CandidateSpan cs; |
| margaretha | 21e4ca2 | 2018-11-28 14:25:46 +0100 | [diff] [blame^] | 119 | int counter, start, end = 0; |
| Eliza Margaretha | dffd059 | 2015-01-15 18:24:39 +0000 | [diff] [blame] | 120 | |
| margaretha | f151c96 | 2018-11-27 17:38:59 +0100 | [diff] [blame] | 121 | if (direction < 0) { // left |
| Eliza Margaretha | dffd059 | 2015-01-15 18:24:39 +0000 | [diff] [blame] | 122 | counter = max; |
| 123 | while (counter >= min) { |
| margaretha | 21e4ca2 | 2018-11-28 14:25:46 +0100 | [diff] [blame^] | 124 | start = firstSpans.start() - counter; |
| 125 | if (start >= 0) { |
| 126 | cs = new CandidateSpan(start, firstSpans.end(), |
| 127 | firstSpans.doc(), firstSpans.cost(), |
| 128 | createPayloads(start, firstSpans.start())); |
| Eliza Margaretha | dffd059 | 2015-01-15 18:24:39 +0000 | [diff] [blame] | 129 | |
| margaretha | 21e4ca2 | 2018-11-28 14:25:46 +0100 | [diff] [blame^] | 130 | candidateSpans.add(cs); |
| 131 | } |
| Eliza Margaretha | dffd059 | 2015-01-15 18:24:39 +0000 | [diff] [blame] | 132 | counter--; |
| 133 | } |
| margaretha | 21e4ca2 | 2018-11-28 14:25:46 +0100 | [diff] [blame^] | 134 | |
| margaretha | 52a0d11 | 2018-11-28 12:58:55 +0100 | [diff] [blame] | 135 | int lastPosition = firstSpans.start(); |
| margaretha | 21e4ca2 | 2018-11-28 14:25:46 +0100 | [diff] [blame^] | 136 | if (hasMoreSpans && (hasMoreSpans = firstSpans.next())) { |
| margaretha | f151c96 | 2018-11-27 17:38:59 +0100 | [diff] [blame] | 137 | start = Math.max(0, firstSpans.start() - max); |
| margaretha | 21e4ca2 | 2018-11-28 14:25:46 +0100 | [diff] [blame^] | 138 | if (DEBUG) { |
| 139 | log.debug("next candidate start: " + start + ", lastPosition " |
| 140 | + lastPosition); |
| 141 | }; |
| margaretha | f151c96 | 2018-11-27 17:38:59 +0100 | [diff] [blame] | 142 | if (start <= lastPosition) { |
| margaretha | 52a0d11 | 2018-11-28 12:58:55 +0100 | [diff] [blame] | 143 | setCandidateList(); |
| margaretha | f151c96 | 2018-11-27 17:38:59 +0100 | [diff] [blame] | 144 | } |
| 145 | } |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 146 | } |
| 147 | else { |
| Eliza Margaretha | dffd059 | 2015-01-15 18:24:39 +0000 | [diff] [blame] | 148 | counter = min; |
| 149 | while (counter <= max) { |
| margaretha | 21e4ca2 | 2018-11-28 14:25:46 +0100 | [diff] [blame^] | 150 | // TODO: How do I know if the end is already too far |
| 151 | // (over the end of the doc)? |
| Eliza Margaretha | dffd059 | 2015-01-15 18:24:39 +0000 | [diff] [blame] | 152 | end = firstSpans.end() + counter; |
| 153 | cs = new CandidateSpan(firstSpans.start(), end, |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 154 | firstSpans.doc(), firstSpans.cost(), |
| 155 | createPayloads(firstSpans.end(), end)); |
| Eliza Margaretha | dffd059 | 2015-01-15 18:24:39 +0000 | [diff] [blame] | 156 | candidateSpans.add(cs); |
| 157 | counter++; |
| 158 | } |
| margaretha | 21e4ca2 | 2018-11-28 14:25:46 +0100 | [diff] [blame^] | 159 | |
| 160 | int lastPosition = end; |
| 161 | if (hasMoreSpans && (hasMoreSpans = firstSpans.next())) { |
| 162 | if (DEBUG) { |
| 163 | log.debug("next candidate start: " + firstSpans.start() |
| 164 | + ", lastPosition " + lastPosition); |
| 165 | }; |
| 166 | if (firstSpans.start() <= lastPosition) { |
| 167 | setCandidateList(); |
| 168 | } |
| 169 | } |
| Eliza Margaretha | dffd059 | 2015-01-15 18:24:39 +0000 | [diff] [blame] | 170 | } |
| 171 | } |
| 172 | |
| 173 | /** |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 174 | * Prepares the payloads for a candidate match (ExpandedSpans). If |
| margaretha | f151c96 | 2018-11-27 17:38:59 +0100 | [diff] [blame] | 175 | * the class number is set, the extension offsets with the given |
| 176 | * start and end positions are to be stored in the payloads. |
| Eliza Margaretha | dffd059 | 2015-01-15 18:24:39 +0000 | [diff] [blame] | 177 | * |
| margaretha | 21e4ca2 | 2018-11-28 14:25:46 +0100 | [diff] [blame^] | 178 | * @param start |
| 179 | * start position |
| 180 | * @param end |
| 181 | * end position |
| Eliza Margaretha | dffd059 | 2015-01-15 18:24:39 +0000 | [diff] [blame] | 182 | * @return the payloads for a candidaete match |
| 183 | * @throws IOException |
| 184 | */ |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 185 | private ArrayList<byte[]> createPayloads (int start, int end) |
| Eliza Margaretha | dffd059 | 2015-01-15 18:24:39 +0000 | [diff] [blame] | 186 | throws IOException { |
| 187 | |
| 188 | ArrayList<byte[]> payload = new ArrayList<byte[]>(); |
| 189 | if (firstSpans.isPayloadAvailable()) { |
| 190 | payload.addAll(firstSpans.getPayload()); |
| 191 | } |
| 192 | if (classNumber > 0) { |
| margaretha | 21e4ca2 | 2018-11-28 14:25:46 +0100 | [diff] [blame^] | 193 | // System.out.println("Extension offsets "+start+","+end); |
| Eliza Margaretha | dffd059 | 2015-01-15 18:24:39 +0000 | [diff] [blame] | 194 | payload.add(createExtensionPayloads(start, end)); |
| 195 | } |
| 196 | return payload; |
| 197 | } |
| 198 | |
| 199 | /** |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 200 | * Prepares a byte array of extension offsets with the given start |
| margaretha | f151c96 | 2018-11-27 17:38:59 +0100 | [diff] [blame] | 201 | * and end positions and the class number, to be stored in |
| 202 | * payloads. |
| Eliza Margaretha | dffd059 | 2015-01-15 18:24:39 +0000 | [diff] [blame] | 203 | * |
| margaretha | 21e4ca2 | 2018-11-28 14:25:46 +0100 | [diff] [blame^] | 204 | * @param start |
| 205 | * start position |
| 206 | * @param end |
| 207 | * end position |
| Eliza Margaretha | dffd059 | 2015-01-15 18:24:39 +0000 | [diff] [blame] | 208 | * @return a byte array of extension offsets and the class number |
| 209 | */ |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 210 | private byte[] createExtensionPayloads (int start, int end) { |
| Akron | 4299355 | 2016-02-04 13:24:24 +0100 | [diff] [blame] | 211 | ByteBuffer buffer = ByteBuffer.allocate(10); |
| 212 | Byte classPTI = 0; |
| 213 | buffer.put(classPTI); |
| Eliza Margaretha | dffd059 | 2015-01-15 18:24:39 +0000 | [diff] [blame] | 214 | buffer.putInt(start); |
| 215 | buffer.putInt(end); |
| 216 | buffer.put(classNumber); |
| 217 | return buffer.array(); |
| 218 | } |
| 219 | |
| 220 | /** |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 221 | * Sets the properties of the given candidate match span as the |
| margaretha | f151c96 | 2018-11-27 17:38:59 +0100 | [diff] [blame] | 222 | * current match (state of ExpandedSpans). |
| Eliza Margaretha | dffd059 | 2015-01-15 18:24:39 +0000 | [diff] [blame] | 223 | * |
| 224 | * @param candidateSpan |
| 225 | */ |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 226 | private void setMatch (CandidateSpan candidateSpan) { |
| Eliza Margaretha | dffd059 | 2015-01-15 18:24:39 +0000 | [diff] [blame] | 227 | matchDocNumber = candidateSpan.getDoc(); |
| 228 | matchStartPosition = candidateSpan.getStart(); |
| 229 | matchEndPosition = candidateSpan.getEnd(); |
| 230 | matchPayload = candidateSpan.getPayloads(); |
| 231 | matchCost = candidateSpan.getCost(); |
| 232 | } |
| 233 | |
| 234 | @Override |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 235 | public boolean skipTo (int target) throws IOException { |
| Eliza Margaretha | dffd059 | 2015-01-15 18:24:39 +0000 | [diff] [blame] | 236 | if (hasMoreSpans && (firstSpans.doc() < target)) { |
| 237 | if (!firstSpans.skipTo(target)) { |
| 238 | hasMoreSpans = false; |
| 239 | return false; |
| 240 | } |
| 241 | } |
| 242 | matchPayload.clear(); |
| 243 | return advance(); |
| 244 | } |
| 245 | |
| 246 | @Override |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 247 | public long cost () { |
| Eliza Margaretha | dffd059 | 2015-01-15 18:24:39 +0000 | [diff] [blame] | 248 | return matchCost; |
| 249 | } |
| Eliza Margaretha | 7ee76da | 2014-08-12 15:32:33 +0000 | [diff] [blame] | 250 | |
| 251 | } |