| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 1 | package de.ids_mannheim.korap.query.spans; |
| 2 | |
| 3 | import java.io.IOException; |
| Eliza Margaretha | 0170b88 | 2014-10-29 15:49:31 +0000 | [diff] [blame] | 4 | import java.nio.ByteBuffer; |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 5 | import java.util.ArrayList; |
| 6 | import java.util.Collections; |
| 7 | import java.util.List; |
| 8 | import java.util.Map; |
| 9 | |
| 10 | import org.apache.lucene.index.AtomicReaderContext; |
| 11 | import org.apache.lucene.index.Term; |
| 12 | import org.apache.lucene.index.TermContext; |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 13 | import org.apache.lucene.search.spans.Spans; |
| 14 | import org.apache.lucene.search.spans.TermSpans; |
| 15 | import org.apache.lucene.util.Bits; |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 16 | import org.slf4j.Logger; |
| 17 | import org.slf4j.LoggerFactory; |
| 18 | |
| 19 | import de.ids_mannheim.korap.query.SpanRelationQuery; |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 20 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 21 | /** |
| 22 | * Enumeration of spans denoting relations between two tokens/elements. The |
| 23 | * start and end of a RelationSpan always denote the start and end of the |
| 24 | * left-side token/element. |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 25 | * |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 26 | * There are 4 types of relations, which is differentiated by the payload length |
| 27 | * in bytes. |
| 28 | * <ol> |
| 29 | * <li>Token to token relation (1 int & 3 short, length: 10)</li> |
| 30 | * <li>Token to span (2 int & 3 short, length: 14)</li> |
| 31 | * <li>Span to token (int, byte, int, 3 short, length: 15)</li> |
| 32 | * <li>Span to Span (3 int & 3 short, length: 18)</li> |
| 33 | * </ol> |
| 34 | * Every integer value denotes the start/end position of the start/target of a |
| 35 | * relation, in this format: (sourceEndPos?, startTargetPos, endTargetPos?). The |
| 36 | * end position of a token is identical to its start position, and therefore not |
| 37 | * is saved in a payload. |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 38 | * |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 39 | * The short values denote the relation id, left id, and right id. The byte in |
| 40 | * relation #3 is just a dummy to create a different length from the relation |
| 41 | * #2. |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 42 | * |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 43 | * NOTE: Sorting of the candidate spans can alternatively be done in indexing, |
| 44 | * instead of here. (first by left positions and then by right positions) |
| 45 | * |
| 46 | * @author margaretha |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 47 | * */ |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 48 | public class RelationSpans extends RelationBaseSpans { |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 49 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 50 | private int currentDoc, currentPosition; |
| 51 | private TermSpans relationTermSpan; |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 52 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 53 | protected Logger logger = LoggerFactory.getLogger(RelationSpans.class); |
| 54 | private List<CandidateRelationSpan> candidateList; |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 55 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 56 | /** |
| Eliza Margaretha | 7612bde | 2015-01-14 10:28:42 +0000 | [diff] [blame] | 57 | * Constructs RelationSpans from the given {@link SpanRelationQuery}. |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 58 | * |
| 59 | * @param relationSpanQuery a SpanRelationQuery |
| 60 | * @param context |
| 61 | * @param acceptDocs |
| 62 | * @param termContexts |
| 63 | * @throws IOException |
| 64 | */ |
| 65 | public RelationSpans(SpanRelationQuery relationSpanQuery, |
| 66 | AtomicReaderContext context, Bits acceptDocs, |
| 67 | Map<Term, TermContext> termContexts) throws IOException { |
| 68 | super(relationSpanQuery, context, acceptDocs, termContexts); |
| 69 | candidateList = new ArrayList<>(); |
| 70 | relationTermSpan = (TermSpans) firstSpans; |
| 71 | hasMoreSpans = relationTermSpan.next(); |
| 72 | } |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 73 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 74 | @Override |
| 75 | public boolean next() throws IOException { |
| 76 | isStartEnumeration = false; |
| 77 | return advance(); |
| 78 | } |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 79 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 80 | /** |
| 81 | * Returns true if there is a next match by checking if the CandidateList is |
| 82 | * not empty and set the first element of the list as the next match. |
| 83 | * Otherwise, if the RelationSpan has not ended yet, try to set the |
| 84 | * CandidateList. |
| 85 | * |
| 86 | * @return true if there is a next match. |
| 87 | * @throws IOException |
| 88 | */ |
| 89 | private boolean advance() throws IOException { |
| 90 | while (hasMoreSpans || !candidateList.isEmpty()) { |
| 91 | if (!candidateList.isEmpty()) { |
| 92 | CandidateRelationSpan cs = candidateList.get(0); |
| 93 | this.matchDocNumber = cs.getDoc(); |
| 94 | this.matchStartPosition = cs.getStart(); |
| 95 | this.matchEndPosition = cs.getEnd(); |
| Eliza Margaretha | 2db5e23 | 2015-03-04 10:20:01 +0000 | [diff] [blame^] | 96 | this.matchPayload = cs.getPayloads(); |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 97 | this.setRightStart(cs.getRightStart()); |
| 98 | this.setRightEnd(cs.getRightEnd()); |
| 99 | this.spanId = cs.getSpanId(); // relation id |
| 100 | this.leftId = cs.getLeftId(); |
| 101 | this.rightId = cs.getRightId(); |
| 102 | candidateList.remove(0); |
| 103 | return true; |
| Eliza Margaretha | 2db5e23 | 2015-03-04 10:20:01 +0000 | [diff] [blame^] | 104 | } |
| 105 | else { |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 106 | setCandidateList(); |
| 107 | currentDoc = relationTermSpan.doc(); |
| 108 | currentPosition = relationTermSpan.start(); |
| 109 | } |
| 110 | } |
| 111 | return false; |
| 112 | } |
| Eliza Margaretha | 3e50bc4 | 2014-10-22 15:29:15 +0000 | [diff] [blame] | 113 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 114 | /** |
| 115 | * Setting the CandidateList by adding all relationTermSpan whose start |
| 116 | * position is the same as the current span position, and sort the |
| 117 | * candidateList. |
| 118 | * |
| 119 | * @throws IOException |
| 120 | */ |
| 121 | private void setCandidateList() throws IOException { |
| 122 | while (hasMoreSpans && relationTermSpan.doc() == currentDoc |
| 123 | && relationTermSpan.start() == currentPosition) { |
| Eliza Margaretha | 2db5e23 | 2015-03-04 10:20:01 +0000 | [diff] [blame^] | 124 | CandidateRelationSpan cs = new CandidateRelationSpan(relationTermSpan); |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 125 | readPayload(cs); |
| Eliza Margaretha | 2db5e23 | 2015-03-04 10:20:01 +0000 | [diff] [blame^] | 126 | setPayload(cs); |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 127 | candidateList.add(cs); |
| 128 | hasMoreSpans = relationTermSpan.next(); |
| 129 | } |
| 130 | Collections.sort(candidateList); |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 131 | } |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 132 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 133 | /** |
| 134 | * Identify the relation type of the given {@link CandidateRelationSpan} by |
| 135 | * checking the length of its payloads, and set some properties of the span |
| 136 | * based on the payloads. |
| 137 | * |
| 138 | * @param cs a CandidateRelationSpan |
| 139 | */ |
| 140 | private void readPayload(CandidateRelationSpan cs) { |
| 141 | List<byte[]> payload = (List<byte[]>) cs.getPayloads(); |
| 142 | int length = payload.get(0).length; |
| 143 | ByteBuffer bb = ByteBuffer.allocate(length); |
| 144 | bb.put(payload.get(0)); |
| Eliza Margaretha | d12cabb | 2014-10-27 17:45:34 +0000 | [diff] [blame] | 145 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 146 | int i; |
| 147 | switch (length) { |
| 148 | case 10: // Token to token |
| 149 | i = bb.getInt(0); |
| 150 | cs.setRightStart(i - 1); |
| 151 | cs.setRightEnd(i); |
| 152 | break; |
| Eliza Margaretha | d12cabb | 2014-10-27 17:45:34 +0000 | [diff] [blame] | 153 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 154 | case 14: // Token to span |
| 155 | cs.setRightStart(bb.getInt(0)); |
| 156 | cs.setRightEnd(bb.getInt(4)); |
| 157 | break; |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 158 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 159 | case 15: // Span to token |
| 160 | cs.setEnd(bb.getInt(0)); |
| 161 | i = bb.getInt(5); |
| 162 | cs.setRightStart(i - 1); |
| 163 | cs.setRightEnd(i); |
| 164 | break; |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 165 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 166 | case 18: // Span to span |
| 167 | cs.setEnd(bb.getInt(0)); |
| 168 | cs.setRightStart(bb.getInt(4)); |
| 169 | cs.setRightEnd(bb.getInt(8)); |
| 170 | break; |
| 171 | } |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 172 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 173 | cs.setRightId(bb.getShort(length - 2)); //right id |
| 174 | cs.setLeftId(bb.getShort(length - 4)); //left id |
| 175 | cs.setSpanId(bb.getShort(length - 6)); //relation id |
| 176 | // Payload is cleared. |
| 177 | } |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 178 | |
| Eliza Margaretha | 2db5e23 | 2015-03-04 10:20:01 +0000 | [diff] [blame^] | 179 | private void setPayload(CandidateRelationSpan cs) throws IOException { |
| 180 | ArrayList<byte[]> payload = new ArrayList<byte[]>(); |
| 181 | if (relationTermSpan.isPayloadAvailable()) { |
| 182 | payload.addAll(relationTermSpan.getPayload()); |
| 183 | } |
| 184 | payload.add(createClassPayload(cs.getLeftStart(), cs.getLeftEnd(), |
| 185 | (byte) 1)); |
| 186 | payload.add(createClassPayload(cs.getRightStart(), cs.getRightEnd(), |
| 187 | (byte) 2)); |
| 188 | cs.setPayloads(payload); |
| 189 | } |
| 190 | |
| 191 | private byte[] createClassPayload(int start, int end, byte classNumber) { |
| 192 | ByteBuffer buffer = ByteBuffer.allocate(9); |
| 193 | buffer.putInt(start); |
| 194 | buffer.putInt(end); |
| 195 | buffer.put(classNumber); |
| 196 | return buffer.array(); |
| 197 | } |
| 198 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 199 | @Override |
| 200 | public boolean skipTo(int target) throws IOException { |
| 201 | if (hasMoreSpans && (firstSpans.doc() < target)) { |
| 202 | if (!firstSpans.skipTo(target)) { |
| 203 | candidateList.clear(); |
| 204 | return false; |
| 205 | } |
| 206 | } |
| 207 | setCandidateList(); |
| 208 | matchPayload.clear(); |
| 209 | isStartEnumeration = false; |
| 210 | return advance(); |
| 211 | } |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 212 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 213 | @Override |
| 214 | public long cost() { |
| 215 | return firstSpans.cost(); |
| 216 | } |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 217 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 218 | /** |
| 219 | * Returns the right start position of the current RelationSpan. |
| 220 | * |
| 221 | * @return the right start position of the current RelationSpan. |
| 222 | */ |
| 223 | public int getRightStart() { |
| 224 | return rightStart; |
| 225 | } |
| Eliza Margaretha | d12cabb | 2014-10-27 17:45:34 +0000 | [diff] [blame] | 226 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 227 | /** |
| 228 | * Sets the right start position of the current RelationSpan. |
| 229 | * |
| 230 | * @param rightStart the right start position of the current RelationSpan |
| 231 | */ |
| 232 | public void setRightStart(int rightStart) { |
| 233 | this.rightStart = rightStart; |
| 234 | } |
| Eliza Margaretha | d12cabb | 2014-10-27 17:45:34 +0000 | [diff] [blame] | 235 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 236 | /** |
| 237 | * Returns the right end position of the current RelationSpan. |
| 238 | * |
| 239 | * @return the right end position of the current RelationSpan. |
| 240 | */ |
| 241 | public int getRightEnd() { |
| 242 | return rightEnd; |
| 243 | } |
| Eliza Margaretha | d12cabb | 2014-10-27 17:45:34 +0000 | [diff] [blame] | 244 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 245 | /** |
| 246 | * Sets the right end position of the current RelationSpan. |
| 247 | * |
| 248 | * @param rightEnd the right end position of the current RelationSpan. |
| 249 | */ |
| 250 | public void setRightEnd(int rightEnd) { |
| 251 | this.rightEnd = rightEnd; |
| 252 | } |
| Eliza Margaretha | d12cabb | 2014-10-27 17:45:34 +0000 | [diff] [blame] | 253 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 254 | /** |
| Eliza Margaretha | 2db5e23 | 2015-03-04 10:20:01 +0000 | [diff] [blame^] | 255 | * CandidateRelationSpan stores a state of RelationSpans. In a list, |
| 256 | * CandidateRelationSpans are ordered first by the position of the relation |
| 257 | * left side. |
| 258 | */ |
| 259 | class CandidateRelationSpan extends CandidateSpan { |
| Eliza Margaretha | d12cabb | 2014-10-27 17:45:34 +0000 | [diff] [blame] | 260 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 261 | private int rightStart, rightEnd; |
| 262 | private short leftId, rightId; |
| 263 | |
| 264 | public CandidateRelationSpan(Spans span) throws IOException { |
| 265 | super(span); |
| 266 | } |
| 267 | |
| Eliza Margaretha | 493bfa9 | 2015-01-13 16:16:38 +0000 | [diff] [blame] | 268 | public int getRightEnd() { |
| 269 | return rightEnd; |
| 270 | } |
| 271 | |
| 272 | public void setRightEnd(int rightEnd) { |
| 273 | this.rightEnd = rightEnd; |
| 274 | } |
| 275 | |
| 276 | public int getRightStart() { |
| 277 | return rightStart; |
| 278 | } |
| 279 | |
| 280 | public void setRightStart(int rightStart) { |
| 281 | this.rightStart = rightStart; |
| 282 | } |
| 283 | |
| 284 | public short getLeftId() { |
| 285 | return leftId; |
| 286 | } |
| 287 | |
| 288 | public void setLeftId(short leftId) { |
| 289 | this.leftId = leftId; |
| 290 | } |
| 291 | |
| 292 | public short getRightId() { |
| 293 | return rightId; |
| 294 | } |
| 295 | |
| 296 | public void setRightId(short rightId) { |
| 297 | this.rightId = rightId; |
| 298 | } |
| 299 | |
| 300 | } |
| 301 | |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 302 | } |