| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 1 | package de.ids_mannheim.korap.query.spans; |
| 2 | |
| 3 | import java.io.IOException; |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 4 | import java.util.ArrayList; |
| 5 | import java.util.Collections; |
| 6 | import java.util.List; |
| 7 | import java.util.Map; |
| 8 | |
| 9 | import org.apache.lucene.index.AtomicReaderContext; |
| 10 | import org.apache.lucene.index.Term; |
| 11 | import org.apache.lucene.index.TermContext; |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 12 | import org.apache.lucene.search.spans.Spans; |
| 13 | import org.apache.lucene.search.spans.TermSpans; |
| 14 | import org.apache.lucene.util.Bits; |
| 15 | import org.apache.lucene.util.BytesRef; |
| 16 | import org.slf4j.Logger; |
| 17 | import org.slf4j.LoggerFactory; |
| 18 | |
| 19 | import de.ids_mannheim.korap.query.SpanRelationQuery; |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 20 | |
| 21 | /** Enumeration of spans denoting relations between two tokens/elements. The start and end of |
| 22 | * a RelationSpan always denote the start and end of the source token/element. |
| 23 | * |
| 24 | * There are 4 types of relations, which is differentiated by the payload length in bytes. |
| 25 | * 1. Token to token relation (1 int & 1 short, length: 6) |
| 26 | * 2. Token to span (2 int & 1 short, length: 10) |
| 27 | * 3. Span to token (int, byte, int, short, length: 11) |
| Eliza Margaretha | 51fd5c2 | 2014-10-14 13:12:33 +0000 | [diff] [blame] | 28 | * 4. Span to Span (3 int & 1 short, length: 14) |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 29 | * |
| 30 | * Every integer value denotes the start/end position of the start/target of a relation, |
| 31 | * in this format: (sourceEndPos?, startTargetPos, endTargetPos?). The end position of a token is |
| 32 | * identical to its start position, and therefore not is saved in a payload. |
| 33 | * |
| 34 | * A short value denote the relation id, used for matching relation-attributes. |
| 35 | * The byte in relation #3 is just a dummy to create a different length from the relation #2. |
| 36 | * |
| 37 | * NOTE: Sorting of the candidate spans can alternatively be done in indexing, instead of here. |
| 38 | * |
| 39 | * @author margaretha |
| 40 | * */ |
| Eliza Margaretha | f461127 | 2014-10-16 08:45:33 +0000 | [diff] [blame] | 41 | public class RelationSpans extends SpansWithId{ |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 42 | |
| Eliza Margaretha | 98c200e | 2014-10-15 13:59:58 +0000 | [diff] [blame] | 43 | //short relationId; |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 44 | int targetStart, targetEnd; |
| 45 | int currentDoc, currentPosition; |
| 46 | |
| 47 | private TermSpans relationTermSpan; |
| 48 | |
| 49 | protected Logger logger = LoggerFactory.getLogger(RelationSpans.class); |
| 50 | private List<CandidateRelationSpan> candidateList; |
| 51 | |
| 52 | public RelationSpans(SpanRelationQuery relationSpanQuery, |
| 53 | AtomicReaderContext context, Bits acceptDocs, |
| 54 | Map<Term, TermContext> termContexts) throws IOException { |
| 55 | super(relationSpanQuery, context, acceptDocs, termContexts); |
| 56 | candidateList = new ArrayList<>(); |
| 57 | relationTermSpan = (TermSpans) firstSpans; |
| 58 | hasMoreSpans = relationTermSpan.next(); |
| 59 | } |
| 60 | |
| 61 | @Override |
| 62 | public boolean next() throws IOException { |
| Eliza Margaretha | 51fd5c2 | 2014-10-14 13:12:33 +0000 | [diff] [blame] | 63 | isStartEnumeration=false; |
| 64 | return advance(); |
| 65 | } |
| 66 | |
| 67 | private boolean advance() throws IOException{ |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 68 | while(hasMoreSpans || !candidateList.isEmpty()){ |
| 69 | if (!candidateList.isEmpty()){ |
| 70 | CandidateRelationSpan cs = candidateList.get(0); |
| 71 | this.matchDocNumber = cs.getDoc(); |
| 72 | this.matchStartPosition = cs.getStart(); |
| 73 | this.matchEndPosition = cs.getEnd(); |
| Eliza Margaretha | 98c200e | 2014-10-15 13:59:58 +0000 | [diff] [blame] | 74 | this.matchPayload = cs.getPayloads(); |
| 75 | this.spanId = cs.getSpanId(); // relation id |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 76 | candidateList.remove(0); |
| 77 | return true; |
| 78 | } |
| 79 | else{ |
| 80 | setCandidateList(); |
| 81 | currentDoc = relationTermSpan.doc(); |
| 82 | currentPosition = relationTermSpan.start(); |
| 83 | } |
| 84 | } |
| 85 | return false; |
| 86 | } |
| 87 | |
| 88 | private void setCandidateList() throws IOException { |
| 89 | while (hasMoreSpans && relationTermSpan.doc() == currentDoc && |
| 90 | relationTermSpan.start() == currentPosition){ |
| 91 | CandidateRelationSpan cs = new CandidateRelationSpan(relationTermSpan); |
| 92 | readPayload(cs); |
| 93 | |
| 94 | candidateList.add(cs); |
| 95 | hasMoreSpans = relationTermSpan.next(); |
| 96 | } |
| 97 | Collections.sort(candidateList); |
| 98 | |
| 99 | /*for (CandidateRelationSpan cs:candidateList){ |
| Eliza Margaretha | 98c200e | 2014-10-15 13:59:58 +0000 | [diff] [blame] | 100 | System.out.println(cs.getStart()+","+cs.getEnd() //+" <size:" +payload.get(0).length |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 101 | +" target "+cs.getTargetStart()+","+cs.getTargetEnd() +" id:"+cs.getRelationId()); |
| 102 | }*/ |
| 103 | } |
| 104 | |
| 105 | private void readPayload(CandidateRelationSpan cs) { |
| 106 | List<byte[]> payload = (List<byte[]>) cs.getPayloads(); |
| 107 | int length = payload.get(0).length; |
| 108 | BytesRef payloadBytesRef = new BytesRef(payload.get(0)); |
| 109 | |
| 110 | int i; |
| 111 | |
| 112 | switch (length) { |
| 113 | case 6: // Token to token |
| 114 | i = PayloadReader.readInteger(payloadBytesRef,0); |
| 115 | cs.setTargetStart(i); |
| Eliza Margaretha | 98c200e | 2014-10-15 13:59:58 +0000 | [diff] [blame] | 116 | cs.setTargetEnd(i); |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 117 | break; |
| 118 | |
| 119 | case 10: // Token to span |
| 120 | cs.setTargetStart(PayloadReader.readInteger(payloadBytesRef,0)); |
| 121 | cs.setTargetEnd(PayloadReader.readInteger(payloadBytesRef,4)); |
| 122 | break; |
| 123 | |
| 124 | case 11: // Span to token |
| 125 | cs.setEnd(PayloadReader.readInteger(payloadBytesRef,0)); |
| 126 | i = PayloadReader.readInteger(payloadBytesRef,5); |
| 127 | cs.setTargetStart(i); |
| Eliza Margaretha | 98c200e | 2014-10-15 13:59:58 +0000 | [diff] [blame] | 128 | cs.setTargetEnd(i); |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 129 | break; |
| 130 | |
| Eliza Margaretha | 51fd5c2 | 2014-10-14 13:12:33 +0000 | [diff] [blame] | 131 | case 14: // Span to span |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 132 | cs.setEnd(PayloadReader.readInteger(payloadBytesRef,0)); |
| 133 | cs.setTargetStart(PayloadReader.readInteger(payloadBytesRef,4)); |
| 134 | cs.setTargetEnd(PayloadReader.readInteger(payloadBytesRef,8)); |
| 135 | break; |
| 136 | } |
| 137 | |
| Eliza Margaretha | 98c200e | 2014-10-15 13:59:58 +0000 | [diff] [blame] | 138 | cs.setSpanId(PayloadReader.readShort(payloadBytesRef, length-2)); //relation id |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 139 | } |
| 140 | |
| 141 | @Override |
| Eliza Margaretha | 51fd5c2 | 2014-10-14 13:12:33 +0000 | [diff] [blame] | 142 | public boolean skipTo(int target) throws IOException { |
| 143 | if (hasMoreSpans && (firstSpans.doc() < target)){ |
| 144 | if (!firstSpans.skipTo(target)){ |
| 145 | candidateList.clear(); |
| 146 | return false; |
| 147 | } |
| 148 | } |
| 149 | setCandidateList(); |
| 150 | matchPayload.clear(); |
| 151 | isStartEnumeration=false; |
| 152 | return advance(); |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 153 | } |
| 154 | |
| 155 | @Override |
| 156 | public long cost() { |
| Eliza Margaretha | 51fd5c2 | 2014-10-14 13:12:33 +0000 | [diff] [blame] | 157 | return firstSpans.cost(); |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 158 | } |
| Eliza Margaretha | 98c200e | 2014-10-15 13:59:58 +0000 | [diff] [blame] | 159 | /* |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 160 | public short getRelationId() { |
| 161 | return relationId; |
| 162 | } |
| 163 | |
| 164 | public void setRelationId(short relationId) { |
| 165 | this.relationId = relationId; |
| Eliza Margaretha | 98c200e | 2014-10-15 13:59:58 +0000 | [diff] [blame] | 166 | }*/ |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 167 | |
| 168 | public int getTargetStart() { |
| 169 | return targetStart; |
| 170 | } |
| 171 | |
| 172 | public void setTargetStart(int targetStart) { |
| 173 | this.targetStart = targetStart; |
| 174 | } |
| 175 | |
| 176 | public int getTargetEnd() { |
| 177 | return targetEnd; |
| 178 | } |
| 179 | |
| 180 | public void setTargetEnd(int targetEnd) { |
| 181 | this.targetEnd = targetEnd; |
| 182 | } |
| 183 | |
| 184 | |
| 185 | class CandidateRelationSpan extends CandidateSpan implements Comparable<CandidateSpan>{ |
| 186 | |
| Eliza Margaretha | 98c200e | 2014-10-15 13:59:58 +0000 | [diff] [blame] | 187 | private int targetStart, targetEnd; |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 188 | |
| 189 | public CandidateRelationSpan(Spans span) throws IOException{ |
| 190 | super(span); |
| 191 | } |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 192 | |
| 193 | @Override |
| 194 | public int compareTo(CandidateSpan o) { |
| 195 | |
| 196 | int sourcePositionComparison = super.compareTo(o); |
| 197 | |
| 198 | CandidateRelationSpan cs = (CandidateRelationSpan) o; |
| 199 | if (sourcePositionComparison == 0){ |
| 200 | if (this.getTargetStart() == cs.getTargetStart()){ |
| 201 | if (this.getTargetEnd() == cs.getTargetEnd()) |
| 202 | return 0; |
| 203 | if (this.getTargetEnd() > cs.getTargetEnd() ) |
| 204 | return 1; |
| 205 | else return -1; |
| 206 | } |
| 207 | else if (this.getTargetStart() < cs.getTargetStart()) |
| 208 | return -1; |
| 209 | else return 1; |
| 210 | } |
| 211 | |
| 212 | return sourcePositionComparison; |
| 213 | } |
| 214 | |
| 215 | public int getTargetEnd() { |
| 216 | return targetEnd; |
| 217 | } |
| 218 | |
| 219 | public void setTargetEnd(int targetEnd) { |
| 220 | this.targetEnd = targetEnd; |
| 221 | } |
| 222 | |
| 223 | public int getTargetStart() { |
| 224 | return targetStart; |
| 225 | } |
| 226 | |
| 227 | public void setTargetStart(int targetStart) { |
| 228 | this.targetStart = targetStart; |
| 229 | } |
| Eliza Margaretha | f13b8ad | 2014-10-13 16:36:28 +0000 | [diff] [blame] | 230 | } |
| 231 | |
| 232 | } |