| Eliza Margaretha | fb25cef | 2014-06-06 14:19:07 +0000 | [diff] [blame] | 1 | package de.ids_mannheim.korap.query.spans; |
| 2 | |
| 3 | import java.io.IOException; |
| 4 | import java.nio.ByteBuffer; |
| 5 | import java.util.ArrayList; |
| 6 | import java.util.Collections; |
| 7 | import java.util.List; |
| 8 | import java.util.Map; |
| 9 | |
| 10 | import org.apache.lucene.index.AtomicReaderContext; |
| 11 | import org.apache.lucene.index.Term; |
| 12 | import org.apache.lucene.index.TermContext; |
| 13 | import org.apache.lucene.search.spans.Spans; |
| Eliza Margaretha | 8551e5b | 2014-12-15 16:46:18 +0000 | [diff] [blame] | 14 | import org.apache.lucene.search.spans.TermSpans; |
| Eliza Margaretha | fb25cef | 2014-06-06 14:19:07 +0000 | [diff] [blame] | 15 | import org.apache.lucene.util.Bits; |
| 16 | import org.slf4j.Logger; |
| 17 | import org.slf4j.LoggerFactory; |
| 18 | |
| Eliza Margaretha | fb25cef | 2014-06-06 14:19:07 +0000 | [diff] [blame] | 19 | import de.ids_mannheim.korap.query.SpanAttributeQuery; |
| 20 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 21 | /** |
| 22 | * UPDATE THIS! |
| 23 | * Span enumeration of attributes which are term spans with special |
| 24 | * payload |
| 25 | * assignments referring to another span (e.g. element/relation span) |
| 26 | * to which |
| 27 | * an attribute span belongs. The class is basically a wrapper of |
| 28 | * Lucene {@link TermSpans} with additional functionality regarding |
| 29 | * element/relation |
| 30 | * reference. Element/relation id is annotated ascendingly starting |
| 31 | * from the |
| Eliza Margaretha | 8551e5b | 2014-12-15 16:46:18 +0000 | [diff] [blame] | 32 | * left side. <br/> |
| 33 | * <br/> |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 34 | * The enumeration is ordered firstly by the start position of the |
| 35 | * attribute and |
| 36 | * secondly by the element/relation id descendingly. This order helps |
| 37 | * to match |
| Eliza Margaretha | 8551e5b | 2014-12-15 16:46:18 +0000 | [diff] [blame] | 38 | * element and attributes faster. |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 39 | * |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 40 | * AttributeSpans contain information about the elements they belongs |
| 41 | * to, thus |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 42 | * querying them alone is sufficient to get |
| 43 | * "any element having a specific attribute". |
| 44 | * |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 45 | * @author margaretha |
| Eliza Margaretha | fb25cef | 2014-06-06 14:19:07 +0000 | [diff] [blame] | 46 | * */ |
| margaretha | 9d0f76a | 2015-03-19 10:10:39 +0100 | [diff] [blame^] | 47 | public class AttributeSpans extends SimpleSpans { |
| Nils Diewald | 1455e1e | 2014-08-01 16:12:43 +0000 | [diff] [blame] | 48 | |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 49 | private List<CandidateAttributeSpan> candidateList; |
| 50 | private int currentDoc, currentPosition; |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 51 | private boolean isFinish; |
| Eliza Margaretha | fb25cef | 2014-06-06 14:19:07 +0000 | [diff] [blame] | 52 | |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 53 | protected Logger logger = LoggerFactory.getLogger(AttributeSpans.class); |
| Eliza Margaretha | fb25cef | 2014-06-06 14:19:07 +0000 | [diff] [blame] | 54 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 55 | |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 56 | /** |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 57 | * Constructs Attributespans based on the specified |
| 58 | * SpanAttributeQuery. |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 59 | * |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 60 | * @param spanAttributeQuery |
| 61 | * a spanAttributeQuery |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 62 | * @param context |
| 63 | * @param acceptDocs |
| 64 | * @param termContexts |
| 65 | * @throws IOException |
| 66 | */ |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 67 | public AttributeSpans (SpanAttributeQuery spanAttributeQuery, |
| 68 | AtomicReaderContext context, Bits acceptDocs, |
| 69 | Map<Term, TermContext> termContexts) |
| 70 | throws IOException { |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 71 | super(spanAttributeQuery, context, acceptDocs, termContexts); |
| margaretha | 9d0f76a | 2015-03-19 10:10:39 +0100 | [diff] [blame^] | 72 | this.hasSpanId = true; |
| 73 | |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 74 | candidateList = new ArrayList<>(); |
| 75 | hasMoreSpans = firstSpans.next(); |
| 76 | if (hasMoreSpans) { |
| 77 | currentDoc = firstSpans.doc(); |
| 78 | currentPosition = firstSpans.start(); |
| 79 | } |
| 80 | } |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 81 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 82 | |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 83 | @Override |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 84 | public boolean next () throws IOException { |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 85 | isStartEnumeration = false; |
| 86 | matchPayload.clear(); |
| 87 | return advance(); |
| 88 | } |
| Eliza Margaretha | fb25cef | 2014-06-06 14:19:07 +0000 | [diff] [blame] | 89 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 90 | |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 91 | /** |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 92 | * Moves to the next match by checking the candidate match list or |
| 93 | * setting |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 94 | * the list first when it is empty. |
| 95 | * |
| 96 | * @return true if a match is found |
| 97 | * @throws IOException |
| 98 | */ |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 99 | private boolean advance () throws IOException { |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 100 | while (hasMoreSpans || !candidateList.isEmpty()) { |
| 101 | if (!candidateList.isEmpty()) { |
| 102 | // set the current match from the first CandidateAttributeSpan |
| 103 | // in the candidate list |
| 104 | CandidateAttributeSpan cs = candidateList.get(0); |
| 105 | this.matchDocNumber = cs.getDoc(); |
| 106 | this.matchStartPosition = cs.getStart(); |
| 107 | this.matchEndPosition = cs.getEnd(); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 108 | this.setSpanId(cs.getSpanId()); // referentId |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 109 | candidateList.remove(0); |
| 110 | return true; |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 111 | } |
| 112 | else { |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 113 | setCandidateList(); |
| 114 | currentDoc = firstSpans.doc(); |
| 115 | currentPosition = firstSpans.start(); |
| 116 | } |
| 117 | } |
| 118 | return false; |
| 119 | } |
| Eliza Margaretha | fb25cef | 2014-06-06 14:19:07 +0000 | [diff] [blame] | 120 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 121 | |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 122 | /** |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 123 | * Collects all the attributes in the same start position and sort |
| 124 | * them by |
| 125 | * element/relation Id in a reverse order (the ones with the |
| 126 | * bigger |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 127 | * element/relation Id first). |
| 128 | * |
| 129 | * @throws IOException |
| 130 | */ |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 131 | private void setCandidateList () throws IOException { |
| Eliza Margaretha | 38a9466 | 2014-11-20 13:48:00 +0000 | [diff] [blame] | 132 | |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 133 | while (hasMoreSpans && firstSpans.doc() == currentDoc |
| 134 | && firstSpans.start() == currentPosition) { |
| Eliza Margaretha | 38a9466 | 2014-11-20 13:48:00 +0000 | [diff] [blame] | 135 | |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 136 | candidateList.add(createCandidateSpan()); |
| 137 | hasMoreSpans = firstSpans.next(); |
| 138 | } |
| Eliza Margaretha | 997ccde | 2014-07-04 09:20:35 +0000 | [diff] [blame] | 139 | |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 140 | Collections.sort(candidateList); |
| 141 | Collections.reverse(candidateList); |
| 142 | } |
| Eliza Margaretha | 997ccde | 2014-07-04 09:20:35 +0000 | [diff] [blame] | 143 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 144 | |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 145 | /** |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 146 | * Creates a CandidateAttributeSpan based on the child span and |
| 147 | * set the |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 148 | * spanId and elementEnd from its payloads. |
| 149 | * |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 150 | * @param firstSpans |
| 151 | * an AttributeSpans |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 152 | * @return a CandidateAttributeSpan |
| 153 | * @throws IOException |
| 154 | */ |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 155 | private CandidateAttributeSpan createCandidateSpan () throws IOException { |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 156 | List<byte[]> payload = (List<byte[]>) firstSpans.getPayload(); |
| 157 | ByteBuffer wrapper = ByteBuffer.wrap(payload.get(0)); |
| Eliza Margaretha | fb25cef | 2014-06-06 14:19:07 +0000 | [diff] [blame] | 158 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 159 | short spanId; |
| 160 | int start = 0, end; |
| Eliza Margaretha | 2db5e23 | 2015-03-04 10:20:01 +0000 | [diff] [blame] | 161 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 162 | if (payload.get(0).length == 6) { |
| 163 | end = wrapper.getInt(0); |
| 164 | spanId = wrapper.getShort(4); |
| 165 | return new CandidateAttributeSpan(firstSpans, spanId, end); |
| 166 | } |
| 167 | else if (payload.get(0).length == 10) { |
| margaretha | 9d0f76a | 2015-03-19 10:10:39 +0100 | [diff] [blame^] | 168 | start = wrapper.getInt(0); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 169 | end = wrapper.getInt(4); |
| 170 | spanId = wrapper.getShort(8); |
| 171 | return new CandidateAttributeSpan(firstSpans, spanId, start, end); |
| 172 | } |
| 173 | |
| 174 | throw new NullPointerException("Missing element end in payloads."); |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 175 | } |
| Eliza Margaretha | fb25cef | 2014-06-06 14:19:07 +0000 | [diff] [blame] | 176 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 177 | |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 178 | /** |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 179 | * Tells if the enumeration of the AttributeSpans has come to an |
| 180 | * end. |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 181 | * |
| 182 | * @return true if the enumeration has finished. |
| 183 | */ |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 184 | public boolean isFinish () { |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 185 | return isFinish; |
| 186 | } |
| 187 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 188 | |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 189 | /** |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 190 | * Sets true if the enumeration of the AttributeSpans has come to |
| 191 | * an end. |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 192 | * |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 193 | * @param isFinish |
| 194 | * <code>true</code> if the enumeration of the |
| 195 | * AttributeSpans has come to an end, |
| 196 | * <code>false</code> otherwise. |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 197 | */ |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 198 | public void setFinish (boolean isFinish) { |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 199 | this.isFinish = isFinish; |
| 200 | } |
| 201 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 202 | |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 203 | @Override |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 204 | public boolean skipTo (int target) throws IOException { |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 205 | if (hasMoreSpans && (firstSpans.doc() < target)) { |
| 206 | if (!firstSpans.skipTo(target)) { |
| 207 | candidateList.clear(); |
| 208 | return false; |
| 209 | } |
| 210 | } |
| 211 | setCandidateList(); |
| 212 | matchPayload.clear(); |
| 213 | isStartEnumeration = false; |
| 214 | return advance(); |
| 215 | } |
| 216 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 217 | |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 218 | @Override |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 219 | public long cost () { |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 220 | return firstSpans.cost(); |
| 221 | } |
| 222 | |
| 223 | /** |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 224 | * CandidateAttributeSpan contains information about an Attribute |
| 225 | * span. All |
| 226 | * attribute spans occurring in an identical position are |
| 227 | * collected as |
| 228 | * CandidateAttributeSpans. The list of these |
| 229 | * CandidateAttributeSpans are |
| 230 | * sorted based on the span ids to which the attributes belong to. |
| 231 | * The |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 232 | * attributes with smaller spanIds come first on the list. |
| 233 | * |
| 234 | * */ |
| Eliza Margaretha | c8d5920 | 2014-12-16 16:21:16 +0000 | [diff] [blame] | 235 | class CandidateAttributeSpan extends CandidateSpan implements |
| 236 | Comparable<CandidateSpan> { |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 237 | |
| 238 | private short spanId; |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 239 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 240 | |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 241 | /** |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 242 | * Construct a CandidateAttributeSpan based on the given span, |
| 243 | * spanId, |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 244 | * and elementEnd. |
| 245 | * |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 246 | * @param span |
| 247 | * an AttributeSpans |
| 248 | * @param spanId |
| 249 | * the element or relation span id to which the |
| 250 | * current |
| 251 | * state of the specified AttributeSpans belongs |
| 252 | * to. |
| 253 | * @param elementEnd |
| 254 | * the end position of the element or relation span |
| 255 | * to |
| 256 | * which the current state of the specified |
| 257 | * AttributeSpans |
| 258 | * belongs to. |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 259 | * @throws IOException |
| 260 | */ |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 261 | public CandidateAttributeSpan (Spans span, short spanId, int elementEnd) |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 262 | throws IOException { |
| 263 | super(span); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 264 | setSpanId(spanId); |
| 265 | this.end = elementEnd; |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 266 | } |
| 267 | |
| Eliza Margaretha | 2db5e23 | 2015-03-04 10:20:01 +0000 | [diff] [blame] | 268 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 269 | public CandidateAttributeSpan (Spans span, short spanId, int start, |
| 270 | int end) throws IOException { |
| 271 | super(span); |
| 272 | setSpanId(spanId); |
| 273 | this.start = start; |
| 274 | this.end = end; |
| 275 | } |
| 276 | |
| 277 | |
| 278 | public void setSpanId (short spanId) { |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 279 | this.spanId = spanId; |
| 280 | } |
| 281 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 282 | |
| 283 | public short getSpanId () { |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 284 | return spanId; |
| 285 | } |
| 286 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 287 | |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 288 | @Override |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 289 | public int compareTo (CandidateSpan o) { |
| Eliza Margaretha | 6a2e80b | 2014-12-02 17:03:23 +0000 | [diff] [blame] | 290 | CandidateAttributeSpan cs = (CandidateAttributeSpan) o; |
| 291 | if (this.spanId == cs.spanId) |
| 292 | return 0; |
| 293 | else if (this.spanId > cs.spanId) |
| 294 | return 1; |
| 295 | return -1; |
| 296 | } |
| 297 | } |
| Eliza Margaretha | fb25cef | 2014-06-06 14:19:07 +0000 | [diff] [blame] | 298 | } |