blob: f9ff3d6a7ffe5b225d36751d88e312f404841783 [file] [log] [blame]
Nils Diewaldf399a672013-11-18 17:55:22 +00001package de.ids_mannheim.korap.analysis;
2
3import static de.ids_mannheim.korap.util.KorapArray.*;
4import org.apache.lucene.util.BytesRef;
5import java.nio.ByteBuffer;
6import java.util.*;
7
8
9/**
10 * @author Nils Diewald
Nils Diewaldd0d6feb2014-02-26 18:51:08 +000011 * @version 0.3
Nils Diewaldf399a672013-11-18 17:55:22 +000012 *
13 * MultiTerm represents a term in a MultiTermToken.
14 */
15public class MultiTerm {
16 public int start, end = 0;
17 public String term = null;
18 public Integer posIncr = 1;
19 public boolean storeOffsets = false;
20 public BytesRef payload = null;
21
Nils Diewaldd0d6feb2014-02-26 18:51:08 +000022 private static ByteBuffer bb = ByteBuffer.allocate(8);
23 private static String[] stringOffset;
24
25 private static short i, l;
26
27
Nils Diewaldf399a672013-11-18 17:55:22 +000028 /**
29 * The constructor.
30 *
31 * @param term The term surface.
32 Offsets can be written as an appended and dash separated pair of integers,
33 payloads can be written following a dollar sign.
34 payloads can be typed as being a short (s), an integer (i), or a long (l)
35 in leading angular brackets. All other payloads are treated as being UTF-8
36 characer sequences.
37
38 Examples:
39 MultiTerm test = new MultiTerm("test");
40 MultiTerm test = new MultiTerm("test#0-4");
41 MultiTerm test = new MultiTerm("test#0-4$Example");
42 MultiTerm test = new MultiTerm("test#0-4$<i>1278");
Nils Diewaldd0d6feb2014-02-26 18:51:08 +000043
44 Strings that are malformed fail silently.
Nils Diewaldf399a672013-11-18 17:55:22 +000045 */
46 public MultiTerm (String term) {
Nils Diewaldf399a672013-11-18 17:55:22 +000047 _fromString(term);
48 };
49
Nils Diewaldd0d6feb2014-02-26 18:51:08 +000050
Nils Diewaldf399a672013-11-18 17:55:22 +000051 /**
52 * The constructor with a separated prefix.
53 * new MultiTerm('a', "bcd") is equivalent to
54 * new MultiTerm("a:bcd");
55 *
56 * @param prefix A special prefix for the term.
57 * @param term The term surface.
58 *
59 * @see #MultiTerm(String)
60 */
61 public MultiTerm (char prefix, String term) {
62 StringBuilder sb = new StringBuilder();
Nils Diewaldd0d6feb2014-02-26 18:51:08 +000063 _fromString(sb.append(prefix).append(':').append(term).toString());
Nils Diewaldf399a672013-11-18 17:55:22 +000064 };
Nils Diewaldd0d6feb2014-02-26 18:51:08 +000065
Nils Diewaldf399a672013-11-18 17:55:22 +000066 /**
Nils Diewaldd0d6feb2014-02-26 18:51:08 +000067 * The empty constructor.
Nils Diewaldf399a672013-11-18 17:55:22 +000068 */
69 public MultiTerm () {
70 this.term = "";
Nils Diewaldf399a672013-11-18 17:55:22 +000071 };
72
Nils Diewaldd0d6feb2014-02-26 18:51:08 +000073
74 /**
75 * Sets the term value.
76 *
77 * @param term The term as a string
78 */
79 public void setTerm (String term) {
80 this.term = term;
81 };
82
83
84 /**
85 * Returns the term value.
86 *
87 * @return The term value.
88 */
89 public String getTerm () {
90 return this.term;
91 };
92
93
94 /**
95 * Set the payload as a byte value.
96 *
97 * @param pl The payload.
98 */
99 public void setPayload (Byte pl) {
Nils Diewaldf399a672013-11-18 17:55:22 +0000100 this.payload = new BytesRef( ByteBuffer.allocate(1).put(pl).array());
101 };
102
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000103
104 /**
105 * Set the payload as a short value.
106 *
107 * @param pl The payload.
108 */
109 public void setPayload (short pl) {
Nils Diewaldf399a672013-11-18 17:55:22 +0000110 this.payload = new BytesRef( ByteBuffer.allocate(2).putShort(pl).array());
111 };
112
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000113
114 /**
115 * Set the payload as an integer value.
116 *
117 * @param pl The payload.
118 */
119 public void setPayload (int pl) {
Nils Diewaldf399a672013-11-18 17:55:22 +0000120 this.payload = new BytesRef( ByteBuffer.allocate(4).putInt(pl).array());
121 };
122
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000123
124 /**
125 * Set the payload as a long value.
126 *
127 * @param pl The payload.
128 */
129 public void setPayload (long pl) {
Nils Diewaldf399a672013-11-18 17:55:22 +0000130 this.payload = new BytesRef( ByteBuffer.allocate(8).putLong(pl).array());
131 };
132
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000133
134 /**
135 * Set the payload as a string value.
136 *
137 * @param pl The payload.
138 */
139 public void setPayload (String pl) {
Nils Diewaldf399a672013-11-18 17:55:22 +0000140 this.payload = new BytesRef(pl);
141 };
142
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000143
144 /**
145 * Set the payload as a byte array.
146 *
147 * @param pl The payload.
148 */
149 public void setPayload (byte[] pl) {
Nils Diewaldf399a672013-11-18 17:55:22 +0000150 this.payload = new BytesRef(pl);
151 };
152
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000153
154 /**
155 * Set the payload as a BytesRef.
156 *
157 * @param pl The payload.
158 */
159 public void setPayload (BytesRef pl) {
Nils Diewaldf399a672013-11-18 17:55:22 +0000160 this.payload = pl;
161 };
162
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000163 /**
164 * Get the payload.
165 *
166 * @return The payload as a BytesRef.
167 */
168 public BytesRef getPayload () {
Nils Diewaldf399a672013-11-18 17:55:22 +0000169 return this.payload;
170 };
171
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000172
173 /**
174 * Set the start position of the term.
175 *
176 * @param The start position.
177 */
178 public void setStart (int value) {
Nils Diewaldf399a672013-11-18 17:55:22 +0000179 this.start = value;
180 };
181
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000182
183 /**
184 * Get the start position.
185 *
186 * @return The start position.
187 */
188 public int getStart () {
Nils Diewaldf399a672013-11-18 17:55:22 +0000189 return this.start;
190 };
191
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000192
193 /**
194 * Set the end position of the term.
195 *
196 * @param The end position.
197 */
198 public void setEnd (int value) {
Nils Diewaldf399a672013-11-18 17:55:22 +0000199 this.end = value;
200 };
201
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000202
203 /**
204 * Get the end position.
205 *
206 * @return The end position.
207 */
208 public int getEnd () {
Nils Diewaldf399a672013-11-18 17:55:22 +0000209 return this.end;
210 };
211
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000212
213 /**
214 * Set the flag for stored offsets.
215 *
216 * @param value Boolean value indicating that the term
217 * contains stored offsets.
218 */
219 public void hasStoredOffsets (boolean value) {
220 this.storeOffsets = value;
221 };
222
223
224 /**
225 * Check if there are offsets stored.
226 *
227 * @return Boolean value indicating that the term
228 * contains stored offsets.
229 */
230 public boolean hasStoredOffsets () {
Nils Diewaldf399a672013-11-18 17:55:22 +0000231 return this.storeOffsets;
232 };
233
Nils Diewaldf399a672013-11-18 17:55:22 +0000234
235 private void _fromString (String term) {
236 String[] termSurface = term.split("\\$", 2);
237
238 // Payload is given
239 if (termSurface.length == 2) {
240 String payloadStr = termSurface[1];
241
242 // Payload has a type
243 if (payloadStr.charAt(0) == '<' && payloadStr.charAt(2) == '>') {
Nils Diewaldf399a672013-11-18 17:55:22 +0000244
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000245 // Rewind bytebuffer
246 bb.rewind();
247
248 // Split payload at type marker boundaries
Nils Diewaldf399a672013-11-18 17:55:22 +0000249 String[] pls = payloadStr.split("(?=<)|(?<=>)");
Nils Diewaldf399a672013-11-18 17:55:22 +0000250
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000251 l = 0; // Bytearray length
Nils Diewaldf399a672013-11-18 17:55:22 +0000252
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000253 try {
254 for (i = 1; i < pls.length;) {
255
256 // Resize the bytebuffer
257 if ((bb.capacity() - l) < 8) {
258 bb = ByteBuffer.allocate(bb.capacity() + 8)
259 .put(bb.array());
260 bb.position(l);
261 };
262
263 switch (pls[i]) {
264 case "<b>": // byte
265 bb.put(Byte.parseByte(pls[i+1]));
266 l++;
267 break;
268 case "<s>": // short
269 bb.putShort(Short.parseShort(pls[i+1]));
270 l+=2;
271 break;
272 case "<i>": // integer
273 bb.putInt(Integer.parseInt(pls[i+1]));
274 l+=4;
275 break;
276 case "<l>": // long
277 bb.putLong(Long.parseLong(pls[i+1]));
278 l+=8;
279 break;
280 };
281 i+=2;
Nils Diewaldf399a672013-11-18 17:55:22 +0000282 };
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000283
284 byte[] bytes = new byte[l];
285 System.arraycopy(bb.array(), 0, bytes, 0, l);
286 this.payload = new BytesRef(bytes);
287 }
288 catch (Exception e) {
Nils Diewaldf399a672013-11-18 17:55:22 +0000289 };
Nils Diewaldf399a672013-11-18 17:55:22 +0000290 }
291
292 // Payload is a string
293 else {
294 this.payload = new BytesRef(payloadStr);
295 };
296 };
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000297
298 // Parse offset information
299 stringOffset = termSurface[0].split("\\#", 2);
Nils Diewaldf399a672013-11-18 17:55:22 +0000300
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000301 if (stringOffset.length == 2) {
302
303 // Split start and end position of the offset
304 String[] offset = stringOffset[1].split("\\-", 2);
305
306 // Start and end is given
Nils Diewaldf399a672013-11-18 17:55:22 +0000307 if (offset.length == 2 && offset[0].length() > 0) {
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000308 try {
309 this.start = Integer.parseInt(offset[0]);
310 this.end = Integer.parseInt(offset[1]);
311
312 }
313 catch (NumberFormatException e) {
314 };
Nils Diewaldf399a672013-11-18 17:55:22 +0000315 };
316 };
317 this.term = stringOffset[0];
318 };
319
320
321 /**
322 * Represent the MultiTerm as a string.
323 * Offsets are attached following a hash sign,
324 * payloads are attached following a dollar sign.
325 * All payloads are written as UTF-8 character sequences.
326 *
327 * @see #toStringShort().
328 */
329 public String toString () {
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000330
Nils Diewaldf399a672013-11-18 17:55:22 +0000331 StringBuilder sb = new StringBuilder(this.term);
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000332
Nils Diewaldf399a672013-11-18 17:55:22 +0000333 if (this.start != this.end) {
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000334 sb.append('#')
335 .append(this.start)
336 .append('-')
337 .append(this.end);
Nils Diewaldf399a672013-11-18 17:55:22 +0000338 };
339
340 if (this.payload != null) {
341 sb.append('$');
342 try {
343 sb.append(this.payload.utf8ToString());
344 }
345 catch (AssertionError e) {
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000346 sb.append("<?>")
347 .append(this.payload.toString().replace(' ', ','));
Nils Diewaldf399a672013-11-18 17:55:22 +0000348 };
349 };
350
351 return sb.toString();
352 };
353
354 /**
355 * Represent the MultiTerm as a string.
356 * Payloads are attached following a dollar sign.
357 * All payloads are written as UTF-8 character sequences.
358 * Offsets are neglected.
359 *
360 * @see #toString().
361 */
362 public String toStringShort () {
363 StringBuilder sb = new StringBuilder(this.term);
364 if (this.payload != null) {
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000365 sb.append('$');
366 try {
367 sb.append(this.payload.utf8ToString());
368 }
369 catch (AssertionError e) {
370 sb.append("<?>")
371 .append(this.payload.toString().replace(' ', ','));
372 };
Nils Diewaldf399a672013-11-18 17:55:22 +0000373 };
374 return sb.toString();
375 };
376};