blob: 43163c85168f1deaf820eb9f1f94f2ec93d35612 [file] [log] [blame]
package de.ids_mannheim.korap.query.spans;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;

import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.search.spans.TermSpans;
import org.apache.lucene.util.Bits;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import de.ids_mannheim.korap.query.SpanElementQuery;

Eliza Margaretha1c3bf272014-06-11 11:50:39 +000022/**
23 * @author Nils Diewald, margaretha
Nils Diewald6802acd2014-03-18 18:29:30 +000024 *
Nils Diewald20607ab2014-03-20 23:28:36 +000025 * Use copyFrom instead of clone
Nils Diewald82a4b862014-02-20 21:17:41 +000026 */
Eliza Margarethaf4611272014-10-16 08:45:33 +000027public class ElementSpans extends SpansWithId {
Nils Diewald82a4b862014-02-20 21:17:41 +000028
Eliza Margarethac7fb7312014-07-25 14:11:36 +000029 private List<CandidateElementSpans> candidateList;
30 private int currentDoc, currentPosition;
Eliza Margaretha98c200e2014-10-15 13:59:58 +000031 private TermSpans termSpans;
Nils Diewald20607ab2014-03-20 23:28:36 +000032
Eliza Margaretha23f98762014-10-30 17:34:47 +000033 private Logger logger = LoggerFactory.getLogger(ElementSpans.class);
Nils Diewald1455e1e2014-08-01 16:12:43 +000034
Eliza Margarethac7fb7312014-07-25 14:11:36 +000035 public ElementSpans(SpanElementQuery spanElementQuery,
36 AtomicReaderContext context, Bits acceptDocs,
37 Map<Term, TermContext> termContexts) throws IOException {
38 super(spanElementQuery, context, acceptDocs, termContexts);
39 candidateList = new ArrayList<>();
40 termSpans = (TermSpans) firstSpans;
41 hasMoreSpans = termSpans.next();
42 if (hasMoreSpans) {
43 currentDoc = termSpans.doc();
44 currentPosition = termSpans.start();
Eliza Margarethafb25cef2014-06-06 14:19:07 +000045 }
Eliza Margarethac7fb7312014-07-25 14:11:36 +000046 }
Nils Diewaldf399a672013-11-18 17:55:22 +000047
Eliza Margarethac7fb7312014-07-25 14:11:36 +000048 @Override
49 public boolean next() throws IOException {
50 isStartEnumeration=false;
51 return advance();
52 }
53
54 /** Get the next match by first checking the candidate match list
55 * and setting the list when it is empty.
56 * */
57 private boolean advance() throws IOException {
58 while(hasMoreSpans || !candidateList.isEmpty()){
59 if (!candidateList.isEmpty()){
60 CandidateElementSpans cs = candidateList.get(0);
61 this.matchDocNumber = cs.getDoc();
62 this.matchStartPosition = cs.getStart();
63 this.matchEndPosition = cs.getEnd();
64 this.matchPayload = cs.getPayloads();
Eliza Margaretha98c200e2014-10-15 13:59:58 +000065 //this.setElementRef(cs.getSpanId());
66 this.setSpanId(cs.getSpanId());
Eliza Margarethac7fb7312014-07-25 14:11:36 +000067 candidateList.remove(0);
68 return true;
69 }
70 else{
Eliza Margaretha23f98762014-10-30 17:34:47 +000071 //logger.info("Setting candidate list");
Eliza Margarethac7fb7312014-07-25 14:11:36 +000072 setCandidateList();
73 currentDoc = termSpans.doc();
74 currentPosition = termSpans.start();
75 }
76 }
77 return false;
78 }
79
80 /** Collect all the elements in the same start position and sort them by
81 * end position (smallest first).
82 * */
83 private void setCandidateList() throws IOException {
84 while (hasMoreSpans && termSpans.doc() == currentDoc &&
85 termSpans.start() == currentPosition){
86 CandidateElementSpans cs = new CandidateElementSpans(termSpans,
Eliza Margaretha98c200e2014-10-15 13:59:58 +000087 spanId);
88 //elementRef);
Eliza Margarethac7fb7312014-07-25 14:11:36 +000089 readPayload(cs);
90 candidateList.add(cs);
91 hasMoreSpans = termSpans.next();
92 }
93 Collections.sort(candidateList);
94 }
95
96
97 /** This method reads the payload of the termSpan and assigns the end
98 * position and element ref to the candidate match. The character offset
99 * payload is set as the candidate match payload.
100 * <br/><br/>
101 * <em>Note</em>: payloadbuffer should actually collects all other payload
102 * beside end position and element ref, but KorapIndex identify element's
103 * payload by its length (8), which is only the character offsets. So
104 * these offsets are directly set as the candidate match payload.
105 *
106 * @author margaretha
107 * */
108 private void readPayload(CandidateElementSpans cs) throws IOException {
Eliza Margaretha0170b882014-10-29 15:49:31 +0000109 List<byte[]> payload = (List<byte[]>) termSpans.getPayload();
110 int length = payload.get(0).length;
111 ByteBuffer bb = ByteBuffer.allocate(length);
112 bb.put(payload.get(0));
113
114 if (!payload.isEmpty()) {
115 // set element end position from payload
116 cs.setEnd(bb.getInt(8));
Eliza Margarethac7fb7312014-07-25 14:11:36 +0000117
Eliza Margaretha0170b882014-10-29 15:49:31 +0000118 if (hasSpanId){ // copy element id
119 cs.setSpanId(bb.getShort(12));
Eliza Margaretha1c3bf272014-06-11 11:50:39 +0000120 }
Eliza Margaretha0170b882014-10-29 15:49:31 +0000121 else{ // set element id -1
Eliza Margaretha98c200e2014-10-15 13:59:58 +0000122 cs.setSpanId((short) -1);
Eliza Margaretha1c3bf272014-06-11 11:50:39 +0000123 }
Eliza Margaretha0170b882014-10-29 15:49:31 +0000124 // Copy the start and end character offsets
Eliza Margaretha23f98762014-10-30 17:34:47 +0000125 byte[] b = new byte[8];
126 b = Arrays.copyOfRange(bb.array(), 0, 8);
Eliza Margaretha0170b882014-10-29 15:49:31 +0000127 cs.setPayloads(Collections.singletonList(b));
Eliza Margarethafb25cef2014-06-06 14:19:07 +0000128 }
129 else {
Eliza Margarethac7fb7312014-07-25 14:11:36 +0000130 cs.setEnd(cs.getStart());
Eliza Margaretha98c200e2014-10-15 13:59:58 +0000131 cs.setSpanId((short) -1);
Eliza Margarethac7fb7312014-07-25 14:11:36 +0000132 cs.setPayloads(null);
133 }
Eliza Margarethafb25cef2014-06-06 14:19:07 +0000134 }
Eliza Margarethac7fb7312014-07-25 14:11:36 +0000135
Eliza Margarethafb25cef2014-06-06 14:19:07 +0000136 @Override
Eliza Margarethac7fb7312014-07-25 14:11:36 +0000137 public boolean skipTo(int target) throws IOException {
138 if (hasMoreSpans && (firstSpans.doc() < target)){
139 if (!firstSpans.skipTo(target)){
140 candidateList.clear();
141 return false;
142 }
143 }
144 setCandidateList();
145 matchPayload.clear();
146 isStartEnumeration=false;
147 return advance();
148 }
Nils Diewaldf399a672013-11-18 17:55:22 +0000149
Eliza Margarethac7fb7312014-07-25 14:11:36 +0000150 @Override
151 public long cost() {
152 return termSpans.cost();
153 }
Nils Diewald20607ab2014-03-20 23:28:36 +0000154
Eliza Margarethac7fb7312014-07-25 14:11:36 +0000155 /** Match candidate for element spans.
156 * */
Eliza Margaretha371eab32014-10-29 14:53:37 +0000157 class CandidateElementSpans extends CandidateSpans {
Eliza Margarethac7fb7312014-07-25 14:11:36 +0000158
159 private short elementRef;
160
161 public CandidateElementSpans(Spans span, short elementRef)
162 throws IOException {
163 super(span);
Eliza Margaretha98c200e2014-10-15 13:59:58 +0000164 setSpanId(elementRef);
Eliza Margarethac7fb7312014-07-25 14:11:36 +0000165 }
166
Eliza Margaretha98c200e2014-10-15 13:59:58 +0000167 public void setSpanId(short elementRef) {
Eliza Margarethac7fb7312014-07-25 14:11:36 +0000168 this.elementRef = elementRef;
169 }
Eliza Margaretha98c200e2014-10-15 13:59:58 +0000170 public short getSpanId() {
Eliza Margarethac7fb7312014-07-25 14:11:36 +0000171 return elementRef;
Eliza Margarethae7938d32014-07-29 12:12:15 +0000172 }
Eliza Margarethac7fb7312014-07-25 14:11:36 +0000173 }
Nils Diewaldf399a672013-11-18 17:55:22 +0000174};