| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 1 | package de.ids_mannheim.korap.query.spans; |
| 2 | |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 3 | import java.io.IOException; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 4 | import java.nio.ByteBuffer; |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 5 | import java.util.ArrayList; |
| Eliza Margaretha | 23f9876 | 2014-10-30 17:34:47 +0000 | [diff] [blame] | 6 | import java.util.Arrays; |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 7 | import java.util.Collections; |
| 8 | import java.util.List; |
| 9 | import java.util.Map; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 10 | |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 11 | import org.apache.lucene.index.AtomicReaderContext; |
| 12 | import org.apache.lucene.index.Term; |
| 13 | import org.apache.lucene.index.TermContext; |
| 14 | import org.apache.lucene.search.spans.Spans; |
| 15 | import org.apache.lucene.search.spans.TermSpans; |
| 16 | import org.apache.lucene.util.Bits; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 17 | import org.slf4j.Logger; |
| 18 | import org.slf4j.LoggerFactory; |
| 19 | |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 20 | import de.ids_mannheim.korap.query.SpanElementQuery; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 21 | |
| Eliza Margaretha | 1c3bf27 | 2014-06-11 11:50:39 +0000 | [diff] [blame] | 22 | /** |
| 23 | * @author Nils Diewald, margaretha |
| Nils Diewald | 6802acd | 2014-03-18 18:29:30 +0000 | [diff] [blame] | 24 | * |
| Nils Diewald | 20607ab | 2014-03-20 23:28:36 +0000 | [diff] [blame] | 25 | * Use copyFrom instead of clone |
| Nils Diewald | 82a4b86 | 2014-02-20 21:17:41 +0000 | [diff] [blame] | 26 | */ |
| Eliza Margaretha | f461127 | 2014-10-16 08:45:33 +0000 | [diff] [blame] | 27 | public class ElementSpans extends SpansWithId { |
| Nils Diewald | 82a4b86 | 2014-02-20 21:17:41 +0000 | [diff] [blame] | 28 | |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 29 | private List<CandidateElementSpans> candidateList; |
| 30 | private int currentDoc, currentPosition; |
| Eliza Margaretha | 98c200e | 2014-10-15 13:59:58 +0000 | [diff] [blame] | 31 | private TermSpans termSpans; |
| Nils Diewald | 20607ab | 2014-03-20 23:28:36 +0000 | [diff] [blame] | 32 | |
| Eliza Margaretha | 23f9876 | 2014-10-30 17:34:47 +0000 | [diff] [blame] | 33 | private Logger logger = LoggerFactory.getLogger(ElementSpans.class); |
| Nils Diewald | 1455e1e | 2014-08-01 16:12:43 +0000 | [diff] [blame] | 34 | |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 35 | public ElementSpans(SpanElementQuery spanElementQuery, |
| 36 | AtomicReaderContext context, Bits acceptDocs, |
| 37 | Map<Term, TermContext> termContexts) throws IOException { |
| 38 | super(spanElementQuery, context, acceptDocs, termContexts); |
| 39 | candidateList = new ArrayList<>(); |
| 40 | termSpans = (TermSpans) firstSpans; |
| 41 | hasMoreSpans = termSpans.next(); |
| 42 | if (hasMoreSpans) { |
| 43 | currentDoc = termSpans.doc(); |
| 44 | currentPosition = termSpans.start(); |
| Eliza Margaretha | fb25cef | 2014-06-06 14:19:07 +0000 | [diff] [blame] | 45 | } |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 46 | } |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 47 | |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 48 | @Override |
| 49 | public boolean next() throws IOException { |
| 50 | isStartEnumeration=false; |
| 51 | return advance(); |
| 52 | } |
| 53 | |
| 54 | /** Get the next match by first checking the candidate match list |
| 55 | * and setting the list when it is empty. |
| 56 | * */ |
| 57 | private boolean advance() throws IOException { |
| 58 | while(hasMoreSpans || !candidateList.isEmpty()){ |
| 59 | if (!candidateList.isEmpty()){ |
| 60 | CandidateElementSpans cs = candidateList.get(0); |
| 61 | this.matchDocNumber = cs.getDoc(); |
| 62 | this.matchStartPosition = cs.getStart(); |
| 63 | this.matchEndPosition = cs.getEnd(); |
| 64 | this.matchPayload = cs.getPayloads(); |
| Eliza Margaretha | 98c200e | 2014-10-15 13:59:58 +0000 | [diff] [blame] | 65 | //this.setElementRef(cs.getSpanId()); |
| 66 | this.setSpanId(cs.getSpanId()); |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 67 | candidateList.remove(0); |
| 68 | return true; |
| 69 | } |
| 70 | else{ |
| Eliza Margaretha | 23f9876 | 2014-10-30 17:34:47 +0000 | [diff] [blame] | 71 | //logger.info("Setting candidate list"); |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 72 | setCandidateList(); |
| 73 | currentDoc = termSpans.doc(); |
| 74 | currentPosition = termSpans.start(); |
| 75 | } |
| 76 | } |
| 77 | return false; |
| 78 | } |
| 79 | |
| 80 | /** Collect all the elements in the same start position and sort them by |
| 81 | * end position (smallest first). |
| 82 | * */ |
| 83 | private void setCandidateList() throws IOException { |
| 84 | while (hasMoreSpans && termSpans.doc() == currentDoc && |
| 85 | termSpans.start() == currentPosition){ |
| 86 | CandidateElementSpans cs = new CandidateElementSpans(termSpans, |
| Eliza Margaretha | 98c200e | 2014-10-15 13:59:58 +0000 | [diff] [blame] | 87 | spanId); |
| 88 | //elementRef); |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 89 | readPayload(cs); |
| 90 | candidateList.add(cs); |
| 91 | hasMoreSpans = termSpans.next(); |
| 92 | } |
| 93 | Collections.sort(candidateList); |
| 94 | } |
| 95 | |
| 96 | |
| 97 | /** This method reads the payload of the termSpan and assigns the end |
| 98 | * position and element ref to the candidate match. The character offset |
| 99 | * payload is set as the candidate match payload. |
| 100 | * <br/><br/> |
| 101 | * <em>Note</em>: payloadbuffer should actually collects all other payload |
| 102 | * beside end position and element ref, but KorapIndex identify element's |
| 103 | * payload by its length (8), which is only the character offsets. So |
| 104 | * these offsets are directly set as the candidate match payload. |
| 105 | * |
| 106 | * @author margaretha |
| 107 | * */ |
| 108 | private void readPayload(CandidateElementSpans cs) throws IOException { |
| Eliza Margaretha | 0170b88 | 2014-10-29 15:49:31 +0000 | [diff] [blame] | 109 | List<byte[]> payload = (List<byte[]>) termSpans.getPayload(); |
| 110 | int length = payload.get(0).length; |
| 111 | ByteBuffer bb = ByteBuffer.allocate(length); |
| 112 | bb.put(payload.get(0)); |
| 113 | |
| 114 | if (!payload.isEmpty()) { |
| 115 | // set element end position from payload |
| 116 | cs.setEnd(bb.getInt(8)); |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 117 | |
| Eliza Margaretha | 0170b88 | 2014-10-29 15:49:31 +0000 | [diff] [blame] | 118 | if (hasSpanId){ // copy element id |
| 119 | cs.setSpanId(bb.getShort(12)); |
| Eliza Margaretha | 1c3bf27 | 2014-06-11 11:50:39 +0000 | [diff] [blame] | 120 | } |
| Eliza Margaretha | 0170b88 | 2014-10-29 15:49:31 +0000 | [diff] [blame] | 121 | else{ // set element id -1 |
| Eliza Margaretha | 98c200e | 2014-10-15 13:59:58 +0000 | [diff] [blame] | 122 | cs.setSpanId((short) -1); |
| Eliza Margaretha | 1c3bf27 | 2014-06-11 11:50:39 +0000 | [diff] [blame] | 123 | } |
| Eliza Margaretha | 0170b88 | 2014-10-29 15:49:31 +0000 | [diff] [blame] | 124 | // Copy the start and end character offsets |
| Eliza Margaretha | 23f9876 | 2014-10-30 17:34:47 +0000 | [diff] [blame] | 125 | byte[] b = new byte[8]; |
| 126 | b = Arrays.copyOfRange(bb.array(), 0, 8); |
| Eliza Margaretha | 0170b88 | 2014-10-29 15:49:31 +0000 | [diff] [blame] | 127 | cs.setPayloads(Collections.singletonList(b)); |
| Eliza Margaretha | fb25cef | 2014-06-06 14:19:07 +0000 | [diff] [blame] | 128 | } |
| 129 | else { |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 130 | cs.setEnd(cs.getStart()); |
| Eliza Margaretha | 98c200e | 2014-10-15 13:59:58 +0000 | [diff] [blame] | 131 | cs.setSpanId((short) -1); |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 132 | cs.setPayloads(null); |
| 133 | } |
| Eliza Margaretha | fb25cef | 2014-06-06 14:19:07 +0000 | [diff] [blame] | 134 | } |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 135 | |
| Eliza Margaretha | fb25cef | 2014-06-06 14:19:07 +0000 | [diff] [blame] | 136 | @Override |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 137 | public boolean skipTo(int target) throws IOException { |
| 138 | if (hasMoreSpans && (firstSpans.doc() < target)){ |
| 139 | if (!firstSpans.skipTo(target)){ |
| 140 | candidateList.clear(); |
| 141 | return false; |
| 142 | } |
| 143 | } |
| 144 | setCandidateList(); |
| 145 | matchPayload.clear(); |
| 146 | isStartEnumeration=false; |
| 147 | return advance(); |
| 148 | } |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 149 | |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 150 | @Override |
| 151 | public long cost() { |
| 152 | return termSpans.cost(); |
| 153 | } |
| Nils Diewald | 20607ab | 2014-03-20 23:28:36 +0000 | [diff] [blame] | 154 | |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 155 | /** Match candidate for element spans. |
| 156 | * */ |
| Eliza Margaretha | 371eab3 | 2014-10-29 14:53:37 +0000 | [diff] [blame] | 157 | class CandidateElementSpans extends CandidateSpans { |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 158 | |
| 159 | private short elementRef; |
| 160 | |
| 161 | public CandidateElementSpans(Spans span, short elementRef) |
| 162 | throws IOException { |
| 163 | super(span); |
| Eliza Margaretha | 98c200e | 2014-10-15 13:59:58 +0000 | [diff] [blame] | 164 | setSpanId(elementRef); |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 165 | } |
| 166 | |
| Eliza Margaretha | 98c200e | 2014-10-15 13:59:58 +0000 | [diff] [blame] | 167 | public void setSpanId(short elementRef) { |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 168 | this.elementRef = elementRef; |
| 169 | } |
| Eliza Margaretha | 98c200e | 2014-10-15 13:59:58 +0000 | [diff] [blame] | 170 | public short getSpanId() { |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 171 | return elementRef; |
| Eliza Margaretha | e7938d3 | 2014-07-29 12:12:15 +0000 | [diff] [blame] | 172 | } |
| Eliza Margaretha | c7fb731 | 2014-07-25 14:11:36 +0000 | [diff] [blame] | 173 | } |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 174 | }; |