blob: 3bd0d74ffac5c2f17a8bb12901cab3031b1e7d0d [file] [log] [blame]
Nils Diewaldf399a672013-11-18 17:55:22 +00001package de.ids_mannheim.korap.index;
2
3import java.util.*;
4import java.io.*;
5
6import java.nio.ByteBuffer;
7
8import org.apache.lucene.index.DocsAndPositionsEnum;
9import org.apache.lucene.index.AtomicReaderContext;
10import org.apache.lucene.index.Terms;
11import org.apache.lucene.index.Term;
12import org.apache.lucene.index.TermsEnum;
13import org.apache.lucene.util.BytesRef;
14
15import org.slf4j.Logger;
16import org.slf4j.LoggerFactory;
17
Nils Diewaldf399a672013-11-18 17:55:22 +000018public class PositionsToOffset {
19 private String field;
20 private AtomicReaderContext atomic;
21 private boolean processed = false;
22 private Integer[] pair;
Nils Diewald1e5d5942014-05-20 13:29:53 +000023 private static ByteBuffer bbOffset =
24 ByteBuffer.allocate(8);
Nils Diewaldf399a672013-11-18 17:55:22 +000025
26 HashSet<PositionsToOffsetArray> positions;
27 HashMap<PositionsToOffsetArray, Integer[]> offsets;
28
Nils Diewald1e5d5942014-05-20 13:29:53 +000029 private final static Logger log =
30 LoggerFactory.getLogger(PositionsToOffset.class);
Nils Diewaldf399a672013-11-18 17:55:22 +000031
Nils Diewald82a4b862014-02-20 21:17:41 +000032 // This advices the java compiler to ignore all loggings
33 public static final boolean DEBUG = false;
34
Nils Diewaldf399a672013-11-18 17:55:22 +000035 private class PositionsToOffsetArray {
36 public int docID;
37 public int pos;
38
39 public PositionsToOffsetArray (int docID, int pos) {
40 this.docID = docID;
41 this.pos = pos;
42 };
43
44 public int hashCode(){
45 long hashCode;
46 hashCode = (docID * Integer.MAX_VALUE) - Integer.MAX_VALUE + pos;
47 return new Long(hashCode).hashCode();
48 };
49
50 public boolean equals(Object obj){
51 if (obj instanceof PositionsToOffsetArray) {
52 PositionsToOffsetArray ptoa = (PositionsToOffsetArray) obj;
53 return (ptoa.docID == this.docID && ptoa.pos == this.pos);
54 };
55 return false;
56 };
57 };
58
59 public PositionsToOffset (AtomicReaderContext atomic, String field) {
60 this.field = field;
61 this.atomic = atomic;
62 this.positions = new HashSet<>(64);
63 this.offsets = new HashMap<>(64);
64 };
65
66 public void clear () {
67 this.positions.clear();
68 this.offsets.clear();
69 this.bbOffset.clear();
70 this.processed = false;
71 };
72
73 public void add (int docID, int pos) {
74 this.add(new PositionsToOffsetArray(docID, pos));
75 };
76
77 public void add (PositionsToOffsetArray ptoa) {
Nils Diewald82a4b862014-02-20 21:17:41 +000078 if (DEBUG)
79 log.trace("Add positionsToOffsetArray {}/{}", ptoa.docID, ptoa.pos);
Nils Diewald20607ab2014-03-20 23:28:36 +000080 if (ptoa.pos < 0)
81 return;
82
Nils Diewald833fe7e2013-12-14 16:06:33 +000083 if (this.processed && this.exists(ptoa))
84 return;
Nils Diewald82a4b862014-02-20 21:17:41 +000085
86 if (DEBUG)
87 log.trace("Reopen processing");
88
Nils Diewaldf399a672013-11-18 17:55:22 +000089 this.positions.add(ptoa);
90 this.processed = false;
91 };
92
93 public boolean exists (int docID, int pos) {
94 return this.offsets.containsKey(new PositionsToOffsetArray(docID, pos));
95 };
96
97 public boolean exists (PositionsToOffsetArray ptoa) {
98 return this.offsets.containsKey(ptoa);
99 };
100
101 public int start (int docID, int pos) {
102 return this.start(new PositionsToOffsetArray(docID, pos));
103 };
104
105 public int start (PositionsToOffsetArray ptoa) {
106 if (ptoa.pos < 0)
107 return 0;
108
109 if (!processed)
110 this.offsets();
111
112 Integer[] pair = this.offsets.get(ptoa);
113
114 if (pair == null)
115 return 0;
116
117 return pair[0];
118 };
119
120 public int end (int docID, int pos) {
121 return this.end(new PositionsToOffsetArray(docID, pos));
122 };
123
124 public int end (PositionsToOffsetArray ptoa) {
125 if (ptoa.pos < 0)
126 return -1;
127
128 if (!processed)
129 this.offsets();
130
131 Integer[] pair = this.offsets.get(ptoa);
132 if (pair == null)
133 return -1;
Nils Diewald3ef9a472013-12-02 16:06:09 +0000134
Nils Diewaldf399a672013-11-18 17:55:22 +0000135 return pair[1];
136 };
137
138 public Integer[] span (int docID, int pos) {
139 return this.span(new PositionsToOffsetArray(docID, pos));
140 };
141
142 public Integer[] span (PositionsToOffsetArray ptoa) {
143 if (!processed)
144 this.offsets();
145 return this.offsets.get(ptoa);
146 };
147
Nils Diewaldcde69082014-01-16 15:46:48 +0000148 public void addOffset (int docID,
149 int pos,
150 int startOffset,
151 int endOffset) {
152 offsets.put(
153 new PositionsToOffsetArray(docID, pos),
154 new Integer[]{startOffset, endOffset}
155 );
156 };
Nils Diewaldf399a672013-11-18 17:55:22 +0000157
158 public HashMap<PositionsToOffsetArray, Integer[]> offsets () {
159 if (processed)
160 return offsets;
161
Nils Diewald82a4b862014-02-20 21:17:41 +0000162 if (DEBUG)
163 log.trace("Process offsets");
Nils Diewald833fe7e2013-12-14 16:06:33 +0000164
Nils Diewaldf399a672013-11-18 17:55:22 +0000165 StringBuilder sb = new StringBuilder().append('_');
166
167 try {
168 Terms terms = atomic.reader().fields().terms(field);
169
170 if (terms != null) {
Nils Diewald833fe7e2013-12-14 16:06:33 +0000171 // TODO: Maybe reuse a termsEnum!
Nils Diewaldf399a672013-11-18 17:55:22 +0000172
173 final TermsEnum termsEnum = terms.iterator(null);
174
175 for (PositionsToOffsetArray posDoc : positions) {
176 if (this.exists(posDoc))
177 continue;
178
179 int docID = posDoc.docID;
180
181 /*
182 int pos = posDoc[1];
183 Integer[] posDoc2 = new Integer[2];
184 posDoc2[0] = docID;
185 posDoc2[1] = pos;
186 */
187
188 sb.append(posDoc.pos);
189
190 Term term = new Term(field, sb.toString());
191 sb.setLength(1);
Nils Diewald67f54042014-09-27 14:53:38 +0000192
193 // Set the position in the iterator to the term that is seeked
194 if (termsEnum.seekExact(term.bytes())) {
Nils Diewaldf399a672013-11-18 17:55:22 +0000195
Nils Diewald82a4b862014-02-20 21:17:41 +0000196 if (DEBUG)
197 log.trace("Search for {} in doc {} with pos {}",
198 term.toString(),
199 posDoc.docID,
200 posDoc.pos);
Nils Diewaldf399a672013-11-18 17:55:22 +0000201
202 // Start an iterator to fetch all payloads of the term
203 DocsAndPositionsEnum docs = termsEnum.docsAndPositions(
204 null,
205 null,
206 DocsAndPositionsEnum.FLAG_PAYLOADS
207 );
208
209 if (docs.advance(docID) == docID) {
210 docs.nextPosition();
211
212 BytesRef payload = docs.getPayload();
213
214 if (payload.length == 8) {
215 bbOffset.clear();
216 bbOffset.put(payload.bytes, payload.offset, 8);
217 bbOffset.rewind();
218 Integer[] offsetArray = new Integer[2];
219 offsetArray[0] = bbOffset.getInt();
220 offsetArray[1] = bbOffset.getInt();
221 offsets.put(posDoc, offsetArray);
222
Nils Diewald82a4b862014-02-20 21:17:41 +0000223 if (DEBUG)
224 log.trace("Found {}-{} for {}",
225 offsetArray[0],
226 offsetArray[1],
227 term.toString());
Nils Diewaldf399a672013-11-18 17:55:22 +0000228 }
229
230 else {
231 log.error(
232 "Doc {} has no offsets stored for {}",
233 docID,
234 term.toString()
235 );
236 };
237 };
238 };
239 };
240 };
241 }
242 catch (IOException e) {
Nils Diewald3caa00d2013-12-13 02:24:04 +0000243 log.warn(e.getLocalizedMessage());
Nils Diewaldf399a672013-11-18 17:55:22 +0000244 };
245
246 processed = true;
247 positions.clear();
248 return offsets;
249 };
Nils Diewald1e5d5942014-05-20 13:29:53 +0000250
251 public AtomicReaderContext getAtomicReader () {
252 return this.atomic;
253 };
Nils Diewaldf399a672013-11-18 17:55:22 +0000254};