| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 1 | package de.ids_mannheim.korap.query.spans; |
| 2 | |
| 3 | import java.io.IOException; |
| Eliza Margaretha | 0170b88 | 2014-10-29 15:49:31 +0000 | [diff] [blame] | 4 | import java.nio.ByteBuffer; |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 5 | import java.util.ArrayList; |
| 6 | import java.util.Collections; |
| 7 | import java.util.List; |
| 8 | import java.util.Map; |
| 9 | |
| 10 | import org.apache.lucene.index.AtomicReaderContext; |
| 11 | import org.apache.lucene.index.Term; |
| 12 | import org.apache.lucene.index.TermContext; |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 13 | import org.apache.lucene.search.spans.Spans; |
| 14 | import org.apache.lucene.search.spans.TermSpans; |
| 15 | import org.apache.lucene.util.Bits; |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 16 | import org.slf4j.Logger; |
| 17 | import org.slf4j.LoggerFactory; |
| 18 | |
| 19 | import de.ids_mannheim.korap.query.SpanRelationQuery; |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 20 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 21 | /** |
| 22 | * Enumeration of spans denoting relations between two tokens/elements. The |
| 23 | * start and end of a RelationSpan always denote the start and end of the |
| 24 | * left-side token/element. |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 25 | * |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 26 | * There are 4 types of relations, which is differentiated by the payload length |
| 27 | * in bytes. |
| 28 | * <ol> |
| 29 | * <li>Token to token relation (1 int & 3 short, length: 10)</li> |
| 30 | * <li>Token to span (2 int & 3 short, length: 14)</li> |
| 31 | * <li>Span to token (int, byte, int, 3 short, length: 15)</li> |
| 32 | * <li>Span to Span (3 int & 3 short, length: 18)</li> |
| 33 | * </ol> |
| 34 | * Every integer value denotes the start/end position of the start/target of a |
| 35 | * relation, in this format: (sourceEndPos?, startTargetPos, endTargetPos?). The |
| 36 | * end position of a token is identical to its start position, and therefore not |
| 37 | * is saved in a payload. |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 38 | * |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 39 | * The short values denote the relation id, left id, and right id. The byte in |
| 40 | * relation #3 is just a dummy to create a different length from the relation |
| 41 | * #2. |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 42 | * |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 43 | * NOTE: Sorting of the candidate spans can alternatively be done in indexing, |
| 44 | * instead of here. (first by left positions and then by right positions) |
| 45 | * |
| 46 | * @author margaretha |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 47 | * */ |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 48 | public class RelationSpans extends RelationBaseSpans { |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 49 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 50 | private int currentDoc, currentPosition; |
| 51 | private TermSpans relationTermSpan; |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 52 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 53 | protected Logger logger = LoggerFactory.getLogger(RelationSpans.class); |
| 54 | private List<CandidateRelationSpan> candidateList; |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 55 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 56 | /** |
| Eliza Margaretha | 7612bde | 2015-01-14 10:28:42 +0000 | [diff] [blame] | 57 | * Constructs RelationSpans from the given {@link SpanRelationQuery}. |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 58 | * |
| 59 | * @param relationSpanQuery a SpanRelationQuery |
| 60 | * @param context |
| 61 | * @param acceptDocs |
| 62 | * @param termContexts |
| 63 | * @throws IOException |
| 64 | */ |
| 65 | public RelationSpans(SpanRelationQuery relationSpanQuery, |
| 66 | AtomicReaderContext context, Bits acceptDocs, |
| 67 | Map<Term, TermContext> termContexts) throws IOException { |
| 68 | super(relationSpanQuery, context, acceptDocs, termContexts); |
| 69 | candidateList = new ArrayList<>(); |
| 70 | relationTermSpan = (TermSpans) firstSpans; |
| 71 | hasMoreSpans = relationTermSpan.next(); |
| 72 | } |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 73 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 74 | @Override |
| 75 | public boolean next() throws IOException { |
| 76 | isStartEnumeration = false; |
| 77 | return advance(); |
| 78 | } |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 79 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 80 | /** |
| 81 | * Returns true if there is a next match by checking if the CandidateList is |
| 82 | * not empty and set the first element of the list as the next match. |
| 83 | * Otherwise, if the RelationSpan has not ended yet, try to set the |
| 84 | * CandidateList. |
| 85 | * |
| 86 | * @return true if there is a next match. |
| 87 | * @throws IOException |
| 88 | */ |
| 89 | private boolean advance() throws IOException { |
| 90 | while (hasMoreSpans || !candidateList.isEmpty()) { |
| 91 | if (!candidateList.isEmpty()) { |
| 92 | CandidateRelationSpan cs = candidateList.get(0); |
| 93 | this.matchDocNumber = cs.getDoc(); |
| 94 | this.matchStartPosition = cs.getStart(); |
| 95 | this.matchEndPosition = cs.getEnd(); |
| 96 | this.setRightStart(cs.getRightStart()); |
| 97 | this.setRightEnd(cs.getRightEnd()); |
| 98 | this.spanId = cs.getSpanId(); // relation id |
| 99 | this.leftId = cs.getLeftId(); |
| 100 | this.rightId = cs.getRightId(); |
| 101 | candidateList.remove(0); |
| 102 | return true; |
| 103 | } else { |
| 104 | setCandidateList(); |
| 105 | currentDoc = relationTermSpan.doc(); |
| 106 | currentPosition = relationTermSpan.start(); |
| 107 | } |
| 108 | } |
| 109 | return false; |
| 110 | } |
| Eliza Margaretha | 3e50bc4 | 2014-10-22 15:29:15 +0000 | [diff] [blame] | 111 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 112 | /** |
| 113 | * Setting the CandidateList by adding all relationTermSpan whose start |
| 114 | * position is the same as the current span position, and sort the |
| 115 | * candidateList. |
| 116 | * |
| 117 | * @throws IOException |
| 118 | */ |
| 119 | private void setCandidateList() throws IOException { |
| 120 | while (hasMoreSpans && relationTermSpan.doc() == currentDoc |
| 121 | && relationTermSpan.start() == currentPosition) { |
| 122 | CandidateRelationSpan cs = new CandidateRelationSpan( |
| 123 | relationTermSpan); |
| 124 | readPayload(cs); |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 125 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 126 | candidateList.add(cs); |
| 127 | hasMoreSpans = relationTermSpan.next(); |
| 128 | } |
| 129 | Collections.sort(candidateList); |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 130 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 131 | // for (CandidateRelationSpan cs:candidateList){ |
| 132 | // System.out.println(cs.getStart()+","+cs.getEnd() //+" <size:" +payload.get(0).length |
| 133 | // +" target "+cs.getRightStart()+","+cs.getRightEnd() +" id:"+cs.getSpanId()); |
| 134 | // } |
| 135 | } |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 136 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 137 | /** |
| 138 | * Identify the relation type of the given {@link CandidateRelationSpan} by |
| 139 | * checking the length of its payloads, and set some properties of the span |
| 140 | * based on the payloads. |
| 141 | * |
| 142 | * @param cs a CandidateRelationSpan |
| 143 | */ |
| 144 | private void readPayload(CandidateRelationSpan cs) { |
| 145 | List<byte[]> payload = (List<byte[]>) cs.getPayloads(); |
| 146 | int length = payload.get(0).length; |
| 147 | ByteBuffer bb = ByteBuffer.allocate(length); |
| 148 | bb.put(payload.get(0)); |
| Eliza Margaretha | d12cabb | 2014-10-27 17:45:34 +0000 | [diff] [blame] | 149 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 150 | int i; |
| 151 | switch (length) { |
| 152 | case 10: // Token to token |
| 153 | i = bb.getInt(0); |
| 154 | cs.setRightStart(i - 1); |
| 155 | cs.setRightEnd(i); |
| 156 | break; |
| Eliza Margaretha | d12cabb | 2014-10-27 17:45:34 +0000 | [diff] [blame] | 157 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 158 | case 14: // Token to span |
| 159 | cs.setRightStart(bb.getInt(0)); |
| 160 | cs.setRightEnd(bb.getInt(4)); |
| 161 | break; |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 162 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 163 | case 15: // Span to token |
| 164 | cs.setEnd(bb.getInt(0)); |
| 165 | i = bb.getInt(5); |
| 166 | cs.setRightStart(i - 1); |
| 167 | cs.setRightEnd(i); |
| 168 | break; |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 169 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 170 | case 18: // Span to span |
| 171 | cs.setEnd(bb.getInt(0)); |
| 172 | cs.setRightStart(bb.getInt(4)); |
| 173 | cs.setRightEnd(bb.getInt(8)); |
| 174 | break; |
| 175 | } |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 176 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 177 | cs.setRightId(bb.getShort(length - 2)); //right id |
| 178 | cs.setLeftId(bb.getShort(length - 4)); //left id |
| 179 | cs.setSpanId(bb.getShort(length - 6)); //relation id |
| 180 | // Payload is cleared. |
| 181 | } |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 182 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 183 | @Override |
| 184 | public boolean skipTo(int target) throws IOException { |
| 185 | if (hasMoreSpans && (firstSpans.doc() < target)) { |
| 186 | if (!firstSpans.skipTo(target)) { |
| 187 | candidateList.clear(); |
| 188 | return false; |
| 189 | } |
| 190 | } |
| 191 | setCandidateList(); |
| 192 | matchPayload.clear(); |
| 193 | isStartEnumeration = false; |
| 194 | return advance(); |
| 195 | } |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 196 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 197 | @Override |
| 198 | public long cost() { |
| 199 | return firstSpans.cost(); |
| 200 | } |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 201 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 202 | /** |
| 203 | * Returns the right start position of the current RelationSpan. |
| 204 | * |
| 205 | * @return the right start position of the current RelationSpan. |
| 206 | */ |
| 207 | public int getRightStart() { |
| 208 | return rightStart; |
| 209 | } |
| Eliza Margaretha | d12cabb | 2014-10-27 17:45:34 +0000 | [diff] [blame] | 210 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 211 | /** |
| 212 | * Sets the right start position of the current RelationSpan. |
| 213 | * |
| 214 | * @param rightStart the right start position of the current RelationSpan |
| 215 | */ |
| 216 | public void setRightStart(int rightStart) { |
| 217 | this.rightStart = rightStart; |
| 218 | } |
| Eliza Margaretha | d12cabb | 2014-10-27 17:45:34 +0000 | [diff] [blame] | 219 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 220 | /** |
| 221 | * Returns the right end position of the current RelationSpan. |
| 222 | * |
| 223 | * @return the right end position of the current RelationSpan. |
| 224 | */ |
| 225 | public int getRightEnd() { |
| 226 | return rightEnd; |
| 227 | } |
| Eliza Margaretha | d12cabb | 2014-10-27 17:45:34 +0000 | [diff] [blame] | 228 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 229 | /** |
| 230 | * Sets the right end position of the current RelationSpan. |
| 231 | * |
| 232 | * @param rightEnd the right end position of the current RelationSpan. |
| 233 | */ |
| 234 | public void setRightEnd(int rightEnd) { |
| 235 | this.rightEnd = rightEnd; |
| 236 | } |
| Eliza Margaretha | d12cabb | 2014-10-27 17:45:34 +0000 | [diff] [blame] | 237 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 238 | /** |
| 239 | * CandidateRelationSpan stores a state of RelationSpans. In a list, |
| 240 | * CandidateRelationSpans are ordered first by the position of the relation |
| 241 | * left side and then by the position of the relation right side. |
| 242 | */ |
| 243 | class CandidateRelationSpan extends CandidateSpan implements |
| 244 | Comparable<CandidateSpan> { |
| Eliza Margaretha | d12cabb | 2014-10-27 17:45:34 +0000 | [diff] [blame] | 245 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 246 | private int rightStart, rightEnd; |
| 247 | private short leftId, rightId; |
| 248 | |
| 249 | public CandidateRelationSpan(Spans span) throws IOException { |
| 250 | super(span); |
| 251 | } |
| 252 | |
| 253 | @Override |
| 254 | public int compareTo(CandidateSpan o) { |
| 255 | |
| 256 | int sourcePositionComparison = super.compareTo(o); |
| 257 | |
| 258 | CandidateRelationSpan cs = (CandidateRelationSpan) o; |
| 259 | if (sourcePositionComparison == 0) { |
| 260 | if (this.getRightStart() == cs.getRightStart()) { |
| 261 | if (this.getRightEnd() == cs.getRightEnd()) |
| 262 | return 0; |
| 263 | if (this.getRightEnd() > cs.getRightEnd()) |
| 264 | return 1; |
| 265 | else |
| 266 | return -1; |
| 267 | } else if (this.getRightStart() < cs.getRightStart()) |
| 268 | return -1; |
| 269 | else |
| 270 | return 1; |
| 271 | } |
| 272 | |
| 273 | return sourcePositionComparison; |
| 274 | } |
| 275 | |
| 276 | public int getRightEnd() { |
| 277 | return rightEnd; |
| 278 | } |
| 279 | |
| 280 | public void setRightEnd(int rightEnd) { |
| 281 | this.rightEnd = rightEnd; |
| 282 | } |
| 283 | |
| 284 | public int getRightStart() { |
| 285 | return rightStart; |
| 286 | } |
| 287 | |
| 288 | public void setRightStart(int rightStart) { |
| 289 | this.rightStart = rightStart; |
| 290 | } |
| 291 | |
| 292 | public short getLeftId() { |
| 293 | return leftId; |
| 294 | } |
| 295 | |
| 296 | public void setLeftId(short leftId) { |
| 297 | this.leftId = leftId; |
| 298 | } |
| 299 | |
| 300 | public short getRightId() { |
| 301 | return rightId; |
| 302 | } |
| 303 | |
| 304 | public void setRightId(short rightId) { |
| 305 | this.rightId = rightId; |
| 306 | } |
| 307 | |
| 308 | } |
| 309 | |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 310 | } |