| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 1 | package de.ids_mannheim.korap.query.spans; |
| 2 | |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 3 | import java.io.IOException; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 4 | import java.nio.ByteBuffer; |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 5 | import java.util.ArrayList; |
| 6 | import java.util.Collections; |
| 7 | import java.util.List; |
| 8 | import java.util.Map; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 9 | |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 10 | import org.apache.lucene.index.AtomicReaderContext; |
| 11 | import org.apache.lucene.index.Term; |
| 12 | import org.apache.lucene.index.TermContext; |
| 13 | import org.apache.lucene.search.spans.Spans; |
| 14 | import org.apache.lucene.search.spans.TermSpans; |
| 15 | import org.apache.lucene.util.Bits; |
| 16 | import org.apache.lucene.util.BytesRef; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 17 | import org.slf4j.Logger; |
| 18 | import org.slf4j.LoggerFactory; |
| 19 | |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 20 | import de.ids_mannheim.korap.query.SpanElementQuery; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 21 | |
| Eliza Margaretha | 1c3bf27 | 2014-06-11 11:50:39 +0000 | [diff] [blame] | 22 | /** |
| 23 | * @author Nils Diewald, margaretha |
| Nils Diewald | 6802acd | 2014-03-18 18:29:30 +0000 | [diff] [blame] | 24 | * |
| Nils Diewald | 20607ab | 2014-03-20 23:28:36 +0000 | [diff] [blame] | 25 | * Use copyFrom instead of clone |
| Nils Diewald | 82a4b86 | 2014-02-20 21:17:41 +0000 | [diff] [blame] | 26 | */ |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 27 | public class ElementSpans extends SimpleSpans { |
| Nils Diewald | 82a4b86 | 2014-02-20 21:17:41 +0000 | [diff] [blame] | 28 | |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 29 | private List<CandidateElementSpans> candidateList; |
| 30 | private int currentDoc, currentPosition; |
| 31 | private short elementRef; |
| 32 | private TermSpans termSpans; |
| Nils Diewald | 20607ab | 2014-03-20 23:28:36 +0000 | [diff] [blame] | 33 | |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 34 | public boolean isElementRef = false; // A dummy flag |
| Nils Diewald | 20607ab | 2014-03-20 23:28:36 +0000 | [diff] [blame] | 35 | |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 36 | protected Logger logger = LoggerFactory.getLogger(AttributeSpans.class); |
| Nils Diewald | 20607ab | 2014-03-20 23:28:36 +0000 | [diff] [blame] | 37 | |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 38 | public ElementSpans(SpanElementQuery spanElementQuery, |
| 39 | AtomicReaderContext context, Bits acceptDocs, |
| 40 | Map<Term, TermContext> termContexts) throws IOException { |
| 41 | super(spanElementQuery, context, acceptDocs, termContexts); |
| 42 | candidateList = new ArrayList<>(); |
| 43 | termSpans = (TermSpans) firstSpans; |
| 44 | hasMoreSpans = termSpans.next(); |
| 45 | if (hasMoreSpans) { |
| 46 | currentDoc = termSpans.doc(); |
| 47 | currentPosition = termSpans.start(); |
| Eliza Margaretha | fb25cef | 2014-06-06 14:19:07 +0000 | [diff] [blame] | 48 | } |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 49 | } |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 50 | |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 51 | @Override |
| 52 | public boolean next() throws IOException { |
| 53 | isStartEnumeration=false; |
| 54 | return advance(); |
| 55 | } |
| 56 | |
| 57 | /** Get the next match by first checking the candidate match list |
| 58 | * and setting the list when it is empty. |
| 59 | * */ |
| 60 | private boolean advance() throws IOException { |
| 61 | while(hasMoreSpans || !candidateList.isEmpty()){ |
| 62 | if (!candidateList.isEmpty()){ |
| 63 | CandidateElementSpans cs = candidateList.get(0); |
| 64 | this.matchDocNumber = cs.getDoc(); |
| 65 | this.matchStartPosition = cs.getStart(); |
| 66 | this.matchEndPosition = cs.getEnd(); |
| 67 | this.matchPayload = cs.getPayloads(); |
| 68 | this.setElementRef(cs.getElementRef()); |
| 69 | candidateList.remove(0); |
| 70 | return true; |
| 71 | } |
| 72 | else{ |
| 73 | logger.info("Setting candidate list"); |
| 74 | setCandidateList(); |
| 75 | currentDoc = termSpans.doc(); |
| 76 | currentPosition = termSpans.start(); |
| 77 | } |
| 78 | } |
| 79 | return false; |
| 80 | } |
| 81 | |
| 82 | /** Collect all the elements in the same start position and sort them by |
| 83 | * end position (smallest first). |
| 84 | * */ |
| 85 | private void setCandidateList() throws IOException { |
| 86 | while (hasMoreSpans && termSpans.doc() == currentDoc && |
| 87 | termSpans.start() == currentPosition){ |
| 88 | CandidateElementSpans cs = new CandidateElementSpans(termSpans, |
| 89 | elementRef); |
| 90 | readPayload(cs); |
| 91 | candidateList.add(cs); |
| 92 | hasMoreSpans = termSpans.next(); |
| 93 | } |
| 94 | Collections.sort(candidateList); |
| 95 | } |
| 96 | |
| 97 | |
| 98 | /** This method reads the payload of the termSpan and assigns the end |
| 99 | * position and element ref to the candidate match. The character offset |
| 100 | * payload is set as the candidate match payload. |
| 101 | * <br/><br/> |
| 102 | * <em>Note</em>: payloadbuffer should actually collects all other payload |
| 103 | * beside end position and element ref, but KorapIndex identify element's |
| 104 | * payload by its length (8), which is only the character offsets. So |
| 105 | * these offsets are directly set as the candidate match payload. |
| 106 | * |
| 107 | * @author margaretha |
| 108 | * */ |
| 109 | private void readPayload(CandidateElementSpans cs) throws IOException { |
| 110 | BytesRef payload = termSpans.getPostings().getPayload(); |
| 111 | //ByteBuffer payloadBuffer = ByteBuffer.allocate(128); |
| 112 | |
| Eliza Margaretha | fb25cef | 2014-06-06 14:19:07 +0000 | [diff] [blame] | 113 | if (payload != null) { |
| Eliza Margaretha | fb25cef | 2014-06-06 14:19:07 +0000 | [diff] [blame] | 114 | // Copy some payloads like start character and end character |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 115 | //payloadBuffer.put(payload.bytes, payload.offset, 8); |
| 116 | |
| 117 | cs.setEnd(readEndPostion(payload)); |
| Eliza Margaretha | 1c3bf27 | 2014-06-11 11:50:39 +0000 | [diff] [blame] | 118 | |
| 119 | if (isElementRef ){ |
| 120 | // Copy rest of payloads after the end position and elementref |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 121 | //payloadBuffer.put(payload.bytes, payload.offset + 14, payload.length - 14); |
| 122 | cs.setElementRef(readElementRef(payload)); |
| Eliza Margaretha | 1c3bf27 | 2014-06-11 11:50:39 +0000 | [diff] [blame] | 123 | } |
| 124 | else{ |
| 125 | // Copy rest of payloads after the end position |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 126 | //payloadBuffer.put(payload.bytes, payload.offset + 12, payload.length - 12); |
| 127 | cs.setElementRef((short) -1); |
| Eliza Margaretha | 1c3bf27 | 2014-06-11 11:50:39 +0000 | [diff] [blame] | 128 | } |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 129 | |
| 130 | //byte[] offsetCharacters = new byte[8]; |
| 131 | //System.arraycopy(payloadBuffer.array(), 0, offsetCharacters, 0, 8); |
| 132 | |
| 133 | cs.setPayloads(Collections.singletonList(readOffset(payload))); |
| Eliza Margaretha | fb25cef | 2014-06-06 14:19:07 +0000 | [diff] [blame] | 134 | } |
| 135 | else { |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 136 | cs.setEnd(cs.getStart()); |
| 137 | cs.setElementRef((short) -1); |
| 138 | cs.setPayloads(null); |
| 139 | } |
| Eliza Margaretha | fb25cef | 2014-06-06 14:19:07 +0000 | [diff] [blame] | 140 | } |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 141 | |
| 142 | |
| 143 | /** Get the offset bytes from the payload. |
| 144 | * */ |
| 145 | private byte[] readOffset(BytesRef payload){ |
| 146 | byte[] b = new byte[8]; |
| 147 | System.arraycopy(payload.bytes, payload.offset, b, 0, 8); |
| 148 | return b; |
| 149 | } |
| 150 | |
| 151 | /** Get the end position bytes from the payload and cast it to int. |
| 152 | * */ |
| 153 | private int readEndPostion(BytesRef payload) { |
| 154 | byte[] b = new byte[4]; |
| 155 | System.arraycopy(payload.bytes, payload.offset + 8, b, 0, 4); |
| 156 | return ByteBuffer.wrap(b).getInt(); |
| 157 | } |
| 158 | |
| 159 | /** Get the elementRef bytes from the payload and cast it into short. |
| 160 | * */ |
| 161 | private short readElementRef(BytesRef payload) { |
| Eliza Margaretha | fb25cef | 2014-06-06 14:19:07 +0000 | [diff] [blame] | 162 | byte[] b = new byte[2]; |
| 163 | System.arraycopy(payload.bytes, payload.offset + 12, b, 0, 2); |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 164 | return ByteBuffer.wrap(b).getShort(); |
| Eliza Margaretha | fb25cef | 2014-06-06 14:19:07 +0000 | [diff] [blame] | 165 | } |
| 166 | |
| 167 | @Override |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 168 | public boolean skipTo(int target) throws IOException { |
| 169 | if (hasMoreSpans && (firstSpans.doc() < target)){ |
| 170 | if (!firstSpans.skipTo(target)){ |
| 171 | candidateList.clear(); |
| 172 | return false; |
| 173 | } |
| 174 | } |
| 175 | setCandidateList(); |
| 176 | matchPayload.clear(); |
| 177 | isStartEnumeration=false; |
| 178 | return advance(); |
| 179 | } |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 180 | |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 181 | @Override |
| 182 | public long cost() { |
| 183 | return termSpans.cost(); |
| 184 | } |
| Nils Diewald | 20607ab | 2014-03-20 23:28:36 +0000 | [diff] [blame] | 185 | |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 186 | public short getElementRef() { |
| 187 | return elementRef; |
| 188 | } |
| Nils Diewald | 20607ab | 2014-03-20 23:28:36 +0000 | [diff] [blame] | 189 | |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 190 | public void setElementRef(short elementRef) { |
| 191 | this.elementRef = elementRef; |
| 192 | } |
| Nils Diewald | 20607ab | 2014-03-20 23:28:36 +0000 | [diff] [blame] | 193 | |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 194 | /** Match candidate for element spans. |
| 195 | * */ |
| Eliza Margaretha | e7938d3 | 2014-07-29 12:12:15 +0000 | [diff] [blame] | 196 | class CandidateElementSpans extends CandidateSpan { |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 197 | |
| 198 | private short elementRef; |
| 199 | |
| 200 | public CandidateElementSpans(Spans span, short elementRef) |
| 201 | throws IOException { |
| 202 | super(span); |
| 203 | setElementRef(elementRef); |
| 204 | } |
| 205 | |
| 206 | public void setElementRef(short elementRef) { |
| 207 | this.elementRef = elementRef; |
| 208 | } |
| 209 | public short getElementRef() { |
| 210 | return elementRef; |
| Eliza Margaretha | e7938d3 | 2014-07-29 12:12:15 +0000 | [diff] [blame] | 211 | } |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 212 | } |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 213 | }; |