blob: c471c5134e724c53400cac345be33bfe7514625b [file] [log] [blame]
Nils Diewaldf399a672013-11-18 17:55:22 +00001package de.ids_mannheim.korap.index;
2
3import java.util.*;
4import java.io.*;
5
6import java.nio.ByteBuffer;
7
8import org.apache.lucene.index.DocsAndPositionsEnum;
Akron700c1eb2015-09-25 16:57:30 +02009import org.apache.lucene.index.LeafReaderContext;
Nils Diewaldf399a672013-11-18 17:55:22 +000010import org.apache.lucene.index.Terms;
11import org.apache.lucene.index.Term;
12import org.apache.lucene.index.TermsEnum;
13import org.apache.lucene.util.BytesRef;
14
15import org.slf4j.Logger;
16import org.slf4j.LoggerFactory;
17
Nils Diewaldf399a672013-11-18 17:55:22 +000018public class PositionsToOffset {
19 private String field;
Akron700c1eb2015-09-25 16:57:30 +020020 private LeafReaderContext atomic;
Nils Diewaldf399a672013-11-18 17:55:22 +000021 private boolean processed = false;
22 private Integer[] pair;
Nils Diewaldbb33da22015-03-04 16:24:25 +000023 private static ByteBuffer bbOffset = ByteBuffer.allocate(8);
Nils Diewaldf399a672013-11-18 17:55:22 +000024
25 HashSet<PositionsToOffsetArray> positions;
26 HashMap<PositionsToOffsetArray, Integer[]> offsets;
27
Nils Diewaldbb33da22015-03-04 16:24:25 +000028 private final static Logger log = LoggerFactory
29 .getLogger(PositionsToOffset.class);
Nils Diewaldf399a672013-11-18 17:55:22 +000030
Nils Diewald82a4b862014-02-20 21:17:41 +000031 // This advices the java compiler to ignore all loggings
32 public static final boolean DEBUG = false;
33
Nils Diewaldf399a672013-11-18 17:55:22 +000034 private class PositionsToOffsetArray {
Nils Diewaldbb33da22015-03-04 16:24:25 +000035 public int docID;
36 public int pos;
Nils Diewaldf399a672013-11-18 17:55:22 +000037
Nils Diewaldbb33da22015-03-04 16:24:25 +000038
39 public PositionsToOffsetArray (int docID, int pos) {
40 this.docID = docID;
41 this.pos = pos;
42 };
43
44
45 public int hashCode () {
46 long hashCode;
47 hashCode = (docID * Integer.MAX_VALUE) - Integer.MAX_VALUE + pos;
48 return new Long(hashCode).hashCode();
49 };
50
51
52 public boolean equals (Object obj) {
53 if (obj instanceof PositionsToOffsetArray) {
54 PositionsToOffsetArray ptoa = (PositionsToOffsetArray) obj;
55 return (ptoa.docID == this.docID && ptoa.pos == this.pos);
56 };
57 return false;
58 };
Nils Diewaldf399a672013-11-18 17:55:22 +000059 };
60
Nils Diewaldbb33da22015-03-04 16:24:25 +000061
Akron700c1eb2015-09-25 16:57:30 +020062 public PositionsToOffset (LeafReaderContext atomic, String field) {
Nils Diewaldbb33da22015-03-04 16:24:25 +000063 this.field = field;
64 this.atomic = atomic;
65 this.positions = new HashSet<>(64);
66 this.offsets = new HashMap<>(64);
Nils Diewaldf399a672013-11-18 17:55:22 +000067 };
68
Nils Diewaldbb33da22015-03-04 16:24:25 +000069
Nils Diewaldf399a672013-11-18 17:55:22 +000070 public void clear () {
Nils Diewaldbb33da22015-03-04 16:24:25 +000071 this.positions.clear();
72 this.offsets.clear();
73 this.bbOffset.clear();
74 this.processed = false;
Nils Diewaldf399a672013-11-18 17:55:22 +000075 };
76
Nils Diewaldbb33da22015-03-04 16:24:25 +000077
Nils Diewaldf399a672013-11-18 17:55:22 +000078 public void add (int docID, int pos) {
Nils Diewaldbb33da22015-03-04 16:24:25 +000079 this.add(new PositionsToOffsetArray(docID, pos));
Nils Diewaldf399a672013-11-18 17:55:22 +000080 };
81
Nils Diewaldbb33da22015-03-04 16:24:25 +000082
Nils Diewaldf399a672013-11-18 17:55:22 +000083 public void add (PositionsToOffsetArray ptoa) {
Nils Diewaldbb33da22015-03-04 16:24:25 +000084 if (DEBUG)
85 log.trace("Add positionsToOffsetArray {}/{}", ptoa.docID, ptoa.pos);
86 if (ptoa.pos < 0)
87 return;
Nils Diewald20607ab2014-03-20 23:28:36 +000088
Nils Diewaldbb33da22015-03-04 16:24:25 +000089 if (this.processed && this.exists(ptoa))
90 return;
Nils Diewald82a4b862014-02-20 21:17:41 +000091
Nils Diewaldbb33da22015-03-04 16:24:25 +000092 if (DEBUG)
93 log.trace("Reopen processing");
Nils Diewald82a4b862014-02-20 21:17:41 +000094
Nils Diewaldbb33da22015-03-04 16:24:25 +000095 this.positions.add(ptoa);
96 this.processed = false;
Nils Diewaldf399a672013-11-18 17:55:22 +000097 };
98
Nils Diewaldbb33da22015-03-04 16:24:25 +000099
Nils Diewaldf399a672013-11-18 17:55:22 +0000100 public boolean exists (int docID, int pos) {
Nils Diewaldbb33da22015-03-04 16:24:25 +0000101 return this.offsets.containsKey(new PositionsToOffsetArray(docID, pos));
Nils Diewaldf399a672013-11-18 17:55:22 +0000102 };
103
Nils Diewaldbb33da22015-03-04 16:24:25 +0000104
Nils Diewaldf399a672013-11-18 17:55:22 +0000105 public boolean exists (PositionsToOffsetArray ptoa) {
Nils Diewaldbb33da22015-03-04 16:24:25 +0000106 return this.offsets.containsKey(ptoa);
Nils Diewaldf399a672013-11-18 17:55:22 +0000107 };
108
Nils Diewaldbb33da22015-03-04 16:24:25 +0000109
Nils Diewaldf399a672013-11-18 17:55:22 +0000110 public int start (int docID, int pos) {
Nils Diewaldbb33da22015-03-04 16:24:25 +0000111 return this.start(new PositionsToOffsetArray(docID, pos));
Nils Diewaldf399a672013-11-18 17:55:22 +0000112 };
113
Nils Diewaldbb33da22015-03-04 16:24:25 +0000114
Nils Diewaldf399a672013-11-18 17:55:22 +0000115 public int start (PositionsToOffsetArray ptoa) {
Nils Diewaldbb33da22015-03-04 16:24:25 +0000116 if (ptoa.pos < 0)
117 return 0;
Nils Diewaldf399a672013-11-18 17:55:22 +0000118
Nils Diewaldbb33da22015-03-04 16:24:25 +0000119 if (!processed)
120 this.offsets();
Nils Diewaldf399a672013-11-18 17:55:22 +0000121
Nils Diewaldbb33da22015-03-04 16:24:25 +0000122 Integer[] pair = this.offsets.get(ptoa);
Nils Diewaldf399a672013-11-18 17:55:22 +0000123
Nils Diewaldbb33da22015-03-04 16:24:25 +0000124 if (pair == null)
125 return 0;
Nils Diewaldf399a672013-11-18 17:55:22 +0000126
Nils Diewaldbb33da22015-03-04 16:24:25 +0000127 return pair[0];
Nils Diewaldf399a672013-11-18 17:55:22 +0000128 };
129
Nils Diewaldbb33da22015-03-04 16:24:25 +0000130
Nils Diewaldf399a672013-11-18 17:55:22 +0000131 public int end (int docID, int pos) {
Nils Diewaldbb33da22015-03-04 16:24:25 +0000132 return this.end(new PositionsToOffsetArray(docID, pos));
Nils Diewaldf399a672013-11-18 17:55:22 +0000133 };
134
Nils Diewaldbb33da22015-03-04 16:24:25 +0000135
Nils Diewaldf399a672013-11-18 17:55:22 +0000136 public int end (PositionsToOffsetArray ptoa) {
Nils Diewaldbb33da22015-03-04 16:24:25 +0000137 if (ptoa.pos < 0)
138 return -1;
Nils Diewaldf399a672013-11-18 17:55:22 +0000139
Nils Diewaldbb33da22015-03-04 16:24:25 +0000140 if (!processed)
141 this.offsets();
Nils Diewaldf399a672013-11-18 17:55:22 +0000142
Nils Diewaldbb33da22015-03-04 16:24:25 +0000143 Integer[] pair = this.offsets.get(ptoa);
144 if (pair == null)
145 return -1;
Nils Diewald3ef9a472013-12-02 16:06:09 +0000146
Nils Diewaldbb33da22015-03-04 16:24:25 +0000147 return pair[1];
Nils Diewaldf399a672013-11-18 17:55:22 +0000148 };
149
Nils Diewaldbb33da22015-03-04 16:24:25 +0000150
Nils Diewaldf399a672013-11-18 17:55:22 +0000151 public Integer[] span (int docID, int pos) {
Nils Diewaldbb33da22015-03-04 16:24:25 +0000152 return this.span(new PositionsToOffsetArray(docID, pos));
Nils Diewaldf399a672013-11-18 17:55:22 +0000153 };
154
Nils Diewaldbb33da22015-03-04 16:24:25 +0000155
Nils Diewaldf399a672013-11-18 17:55:22 +0000156 public Integer[] span (PositionsToOffsetArray ptoa) {
Nils Diewaldbb33da22015-03-04 16:24:25 +0000157 if (!processed)
158 this.offsets();
159 return this.offsets.get(ptoa);
Nils Diewaldf399a672013-11-18 17:55:22 +0000160 };
161
Nils Diewaldbb33da22015-03-04 16:24:25 +0000162
163 public void addOffset (int docID, int pos, int startOffset, int endOffset) {
Eliza Margaretha6f989202016-10-14 21:48:29 +0200164 offsets.put(new PositionsToOffsetArray(docID, pos),
165 new Integer[] { startOffset, endOffset });
Nils Diewaldcde69082014-01-16 15:46:48 +0000166 };
Nils Diewaldf399a672013-11-18 17:55:22 +0000167
Nils Diewaldbb33da22015-03-04 16:24:25 +0000168
Nils Diewaldf399a672013-11-18 17:55:22 +0000169 public HashMap<PositionsToOffsetArray, Integer[]> offsets () {
Nils Diewaldbb33da22015-03-04 16:24:25 +0000170 if (processed)
171 return offsets;
Nils Diewaldf399a672013-11-18 17:55:22 +0000172
Nils Diewaldbb33da22015-03-04 16:24:25 +0000173 if (DEBUG)
174 log.trace("Process offsets");
Nils Diewald833fe7e2013-12-14 16:06:33 +0000175
Nils Diewaldbb33da22015-03-04 16:24:25 +0000176 StringBuilder sb = new StringBuilder().append('_');
Nils Diewaldf399a672013-11-18 17:55:22 +0000177
Nils Diewaldbb33da22015-03-04 16:24:25 +0000178 try {
179 Terms terms = atomic.reader().fields().terms(field);
Nils Diewaldf399a672013-11-18 17:55:22 +0000180
Nils Diewaldbb33da22015-03-04 16:24:25 +0000181 if (terms != null) {
182 // TODO: Maybe reuse a termsEnum!
Nils Diewaldf399a672013-11-18 17:55:22 +0000183
Nils Diewaldbb33da22015-03-04 16:24:25 +0000184 final TermsEnum termsEnum = terms.iterator(null);
Nils Diewaldf399a672013-11-18 17:55:22 +0000185
Nils Diewaldbb33da22015-03-04 16:24:25 +0000186 for (PositionsToOffsetArray posDoc : positions) {
187 if (this.exists(posDoc))
188 continue;
Nils Diewaldf399a672013-11-18 17:55:22 +0000189
Nils Diewaldbb33da22015-03-04 16:24:25 +0000190 int docID = posDoc.docID;
Nils Diewaldf399a672013-11-18 17:55:22 +0000191
Nils Diewaldbb33da22015-03-04 16:24:25 +0000192 /*
193 int pos = posDoc[1];
194 Integer[] posDoc2 = new Integer[2];
195 posDoc2[0] = docID;
196 posDoc2[1] = pos;
197 */
Nils Diewaldf399a672013-11-18 17:55:22 +0000198
Nils Diewaldbb33da22015-03-04 16:24:25 +0000199 sb.append(posDoc.pos);
Nils Diewaldf399a672013-11-18 17:55:22 +0000200
Nils Diewaldbb33da22015-03-04 16:24:25 +0000201 Term term = new Term(field, sb.toString());
202 sb.setLength(1);
Nils Diewaldf399a672013-11-18 17:55:22 +0000203
Nils Diewaldbb33da22015-03-04 16:24:25 +0000204 // Set the position in the iterator to the term that is seeked
205 if (termsEnum.seekExact(term.bytes())) {
Nils Diewaldf399a672013-11-18 17:55:22 +0000206
Nils Diewaldbb33da22015-03-04 16:24:25 +0000207 if (DEBUG)
208 log.trace("Search for {} in doc {} with pos {}",
209 term.toString(), posDoc.docID, posDoc.pos);
Nils Diewaldf399a672013-11-18 17:55:22 +0000210
Nils Diewaldbb33da22015-03-04 16:24:25 +0000211 // Start an iterator to fetch all payloads of the term
212 DocsAndPositionsEnum docs = termsEnum.docsAndPositions(
213 null, null, DocsAndPositionsEnum.FLAG_PAYLOADS);
Nils Diewaldf399a672013-11-18 17:55:22 +0000214
Nils Diewaldbb33da22015-03-04 16:24:25 +0000215 if (docs.advance(docID) == docID) {
216 docs.nextPosition();
Nils Diewaldf399a672013-11-18 17:55:22 +0000217
Nils Diewaldbb33da22015-03-04 16:24:25 +0000218 BytesRef payload = docs.getPayload();
Nils Diewaldf399a672013-11-18 17:55:22 +0000219
Nils Diewaldbb33da22015-03-04 16:24:25 +0000220 if (payload.length == 8) {
221 bbOffset.clear();
222 bbOffset.put(payload.bytes, payload.offset, 8);
223 bbOffset.rewind();
224 Integer[] offsetArray = new Integer[2];
225 offsetArray[0] = bbOffset.getInt();
226 offsetArray[1] = bbOffset.getInt();
227 offsets.put(posDoc, offsetArray);
Nils Diewaldf399a672013-11-18 17:55:22 +0000228
Nils Diewaldbb33da22015-03-04 16:24:25 +0000229 if (DEBUG)
230 log.trace("Found {}-{} for {}",
231 offsetArray[0], offsetArray[1],
232 term.toString());
233 }
234
235 else {
Eliza Margaretha6f989202016-10-14 21:48:29 +0200236 log.error("Doc {} has no offsets stored for {}",
Nils Diewaldbb33da22015-03-04 16:24:25 +0000237 docID, term.toString());
238 };
239 };
240 };
241 };
242 };
243 }
244 catch (IOException e) {
245 log.warn(e.getLocalizedMessage());
246 };
247
248 processed = true;
249 positions.clear();
250 return offsets;
Nils Diewaldf399a672013-11-18 17:55:22 +0000251 };
Nils Diewald1e5d5942014-05-20 13:29:53 +0000252
Nils Diewaldbb33da22015-03-04 16:24:25 +0000253
Akron700c1eb2015-09-25 16:57:30 +0200254 public LeafReaderContext getLeafReader () {
Nils Diewaldbb33da22015-03-04 16:24:25 +0000255 return this.atomic;
Nils Diewald1e5d5942014-05-20 13:29:53 +0000256 };
Nils Diewaldf399a672013-11-18 17:55:22 +0000257};