| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 1 | package de.ids_mannheim.korap.analysis; |
| 2 | |
| 3 | import static de.ids_mannheim.korap.util.KorapArray.*; |
| 4 | import org.apache.lucene.util.BytesRef; |
| 5 | import java.nio.ByteBuffer; |
| 6 | import java.util.*; |
| 7 | |
| 8 | |
| 9 | /** |
| 10 | * @author Nils Diewald |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 11 | * @version 0.3 |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 12 | * |
| 13 | * MultiTerm represents a term in a MultiTermToken. |
| 14 | */ |
| 15 | public class MultiTerm { |
| 16 | public int start, end = 0; |
| 17 | public String term = null; |
| 18 | public Integer posIncr = 1; |
| 19 | public boolean storeOffsets = false; |
| 20 | public BytesRef payload = null; |
| 21 | |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 22 | private static ByteBuffer bb = ByteBuffer.allocate(8); |
| 23 | private static String[] stringOffset; |
| 24 | |
| 25 | private static short i, l; |
| 26 | |
| 27 | |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 28 | /** |
| 29 | * The constructor. |
| 30 | * |
| 31 | * @param term The term surface. |
| 32 | Offsets can be written as an appended and dash separated pair of integers, |
| 33 | payloads can be written following a dollar sign. |
| 34 | payloads can be typed as being a short (s), an integer (i), or a long (l) |
| 35 | in leading angular brackets. All other payloads are treated as being UTF-8 |
| 36 | characer sequences. |
| 37 | |
| 38 | Examples: |
| 39 | MultiTerm test = new MultiTerm("test"); |
| 40 | MultiTerm test = new MultiTerm("test#0-4"); |
| 41 | MultiTerm test = new MultiTerm("test#0-4$Example"); |
| 42 | MultiTerm test = new MultiTerm("test#0-4$<i>1278"); |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 43 | |
| 44 | Strings that are malformed fail silently. |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 45 | */ |
| 46 | public MultiTerm (String term) { |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 47 | _fromString(term); |
| 48 | }; |
| 49 | |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 50 | |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 51 | /** |
| 52 | * The constructor with a separated prefix. |
| 53 | * new MultiTerm('a', "bcd") is equivalent to |
| 54 | * new MultiTerm("a:bcd"); |
| 55 | * |
| 56 | * @param prefix A special prefix for the term. |
| 57 | * @param term The term surface. |
| 58 | * |
| 59 | * @see #MultiTerm(String) |
| 60 | */ |
| 61 | public MultiTerm (char prefix, String term) { |
| 62 | StringBuilder sb = new StringBuilder(); |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 63 | _fromString(sb.append(prefix).append(':').append(term).toString()); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 64 | }; |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 65 | |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 66 | /** |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 67 | * The empty constructor. |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 68 | */ |
| 69 | public MultiTerm () { |
| 70 | this.term = ""; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 71 | }; |
| 72 | |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 73 | |
| 74 | /** |
| 75 | * Sets the term value. |
| 76 | * |
| 77 | * @param term The term as a string |
| 78 | */ |
| 79 | public void setTerm (String term) { |
| 80 | this.term = term; |
| 81 | }; |
| 82 | |
| 83 | |
| 84 | /** |
| 85 | * Returns the term value. |
| 86 | * |
| 87 | * @return The term value. |
| 88 | */ |
| 89 | public String getTerm () { |
| 90 | return this.term; |
| 91 | }; |
| 92 | |
| 93 | |
| 94 | /** |
| 95 | * Set the payload as a byte value. |
| 96 | * |
| 97 | * @param pl The payload. |
| 98 | */ |
| 99 | public void setPayload (Byte pl) { |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 100 | this.payload = new BytesRef( ByteBuffer.allocate(1).put(pl).array()); |
| 101 | }; |
| 102 | |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 103 | |
| 104 | /** |
| 105 | * Set the payload as a short value. |
| 106 | * |
| 107 | * @param pl The payload. |
| 108 | */ |
| 109 | public void setPayload (short pl) { |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 110 | this.payload = new BytesRef( ByteBuffer.allocate(2).putShort(pl).array()); |
| 111 | }; |
| 112 | |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 113 | |
| 114 | /** |
| 115 | * Set the payload as an integer value. |
| 116 | * |
| 117 | * @param pl The payload. |
| 118 | */ |
| 119 | public void setPayload (int pl) { |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 120 | this.payload = new BytesRef( ByteBuffer.allocate(4).putInt(pl).array()); |
| 121 | }; |
| 122 | |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 123 | |
| 124 | /** |
| 125 | * Set the payload as a long value. |
| 126 | * |
| 127 | * @param pl The payload. |
| 128 | */ |
| 129 | public void setPayload (long pl) { |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 130 | this.payload = new BytesRef( ByteBuffer.allocate(8).putLong(pl).array()); |
| 131 | }; |
| 132 | |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 133 | |
| 134 | /** |
| 135 | * Set the payload as a string value. |
| 136 | * |
| 137 | * @param pl The payload. |
| 138 | */ |
| 139 | public void setPayload (String pl) { |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 140 | this.payload = new BytesRef(pl); |
| 141 | }; |
| 142 | |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 143 | |
| 144 | /** |
| 145 | * Set the payload as a byte array. |
| 146 | * |
| 147 | * @param pl The payload. |
| 148 | */ |
| 149 | public void setPayload (byte[] pl) { |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 150 | this.payload = new BytesRef(pl); |
| 151 | }; |
| 152 | |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 153 | |
| 154 | /** |
| 155 | * Set the payload as a BytesRef. |
| 156 | * |
| 157 | * @param pl The payload. |
| 158 | */ |
| 159 | public void setPayload (BytesRef pl) { |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 160 | this.payload = pl; |
| 161 | }; |
| 162 | |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 163 | /** |
| 164 | * Get the payload. |
| 165 | * |
| 166 | * @return The payload as a BytesRef. |
| 167 | */ |
| 168 | public BytesRef getPayload () { |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 169 | return this.payload; |
| 170 | }; |
| 171 | |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 172 | |
| 173 | /** |
| 174 | * Set the start position of the term. |
| 175 | * |
| 176 | * @param The start position. |
| 177 | */ |
| 178 | public void setStart (int value) { |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 179 | this.start = value; |
| 180 | }; |
| 181 | |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 182 | |
| 183 | /** |
| 184 | * Get the start position. |
| 185 | * |
| 186 | * @return The start position. |
| 187 | */ |
| 188 | public int getStart () { |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 189 | return this.start; |
| 190 | }; |
| 191 | |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 192 | |
| 193 | /** |
| 194 | * Set the end position of the term. |
| 195 | * |
| 196 | * @param The end position. |
| 197 | */ |
| 198 | public void setEnd (int value) { |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 199 | this.end = value; |
| 200 | }; |
| 201 | |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 202 | |
| 203 | /** |
| 204 | * Get the end position. |
| 205 | * |
| 206 | * @return The end position. |
| 207 | */ |
| 208 | public int getEnd () { |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 209 | return this.end; |
| 210 | }; |
| 211 | |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 212 | |
| 213 | /** |
| 214 | * Set the flag for stored offsets. |
| 215 | * |
| 216 | * @param value Boolean value indicating that the term |
| 217 | * contains stored offsets. |
| 218 | */ |
| 219 | public void hasStoredOffsets (boolean value) { |
| 220 | this.storeOffsets = value; |
| 221 | }; |
| 222 | |
| 223 | |
| 224 | /** |
| 225 | * Check if there are offsets stored. |
| 226 | * |
| 227 | * @return Boolean value indicating that the term |
| 228 | * contains stored offsets. |
| 229 | */ |
| 230 | public boolean hasStoredOffsets () { |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 231 | return this.storeOffsets; |
| 232 | }; |
| 233 | |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 234 | |
| 235 | private void _fromString (String term) { |
| 236 | String[] termSurface = term.split("\\$", 2); |
| 237 | |
| 238 | // Payload is given |
| 239 | if (termSurface.length == 2) { |
| 240 | String payloadStr = termSurface[1]; |
| 241 | |
| 242 | // Payload has a type |
| 243 | if (payloadStr.charAt(0) == '<' && payloadStr.charAt(2) == '>') { |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 244 | |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 245 | // Rewind bytebuffer |
| 246 | bb.rewind(); |
| 247 | |
| 248 | // Split payload at type marker boundaries |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 249 | String[] pls = payloadStr.split("(?=<)|(?<=>)"); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 250 | |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 251 | l = 0; // Bytearray length |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 252 | |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 253 | try { |
| 254 | for (i = 1; i < pls.length;) { |
| 255 | |
| 256 | // Resize the bytebuffer |
| 257 | if ((bb.capacity() - l) < 8) { |
| 258 | bb = ByteBuffer.allocate(bb.capacity() + 8) |
| 259 | .put(bb.array()); |
| 260 | bb.position(l); |
| 261 | }; |
| 262 | |
| 263 | switch (pls[i]) { |
| 264 | case "<b>": // byte |
| 265 | bb.put(Byte.parseByte(pls[i+1])); |
| 266 | l++; |
| 267 | break; |
| 268 | case "<s>": // short |
| 269 | bb.putShort(Short.parseShort(pls[i+1])); |
| 270 | l+=2; |
| 271 | break; |
| 272 | case "<i>": // integer |
| 273 | bb.putInt(Integer.parseInt(pls[i+1])); |
| 274 | l+=4; |
| 275 | break; |
| 276 | case "<l>": // long |
| 277 | bb.putLong(Long.parseLong(pls[i+1])); |
| 278 | l+=8; |
| 279 | break; |
| 280 | }; |
| 281 | i+=2; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 282 | }; |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 283 | |
| 284 | byte[] bytes = new byte[l]; |
| 285 | System.arraycopy(bb.array(), 0, bytes, 0, l); |
| 286 | this.payload = new BytesRef(bytes); |
| 287 | } |
| 288 | catch (Exception e) { |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 289 | }; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 290 | } |
| 291 | |
| 292 | // Payload is a string |
| 293 | else { |
| 294 | this.payload = new BytesRef(payloadStr); |
| 295 | }; |
| 296 | }; |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 297 | |
| 298 | // Parse offset information |
| 299 | stringOffset = termSurface[0].split("\\#", 2); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 300 | |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 301 | if (stringOffset.length == 2) { |
| 302 | |
| 303 | // Split start and end position of the offset |
| 304 | String[] offset = stringOffset[1].split("\\-", 2); |
| 305 | |
| 306 | // Start and end is given |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 307 | if (offset.length == 2 && offset[0].length() > 0) { |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 308 | try { |
| 309 | this.start = Integer.parseInt(offset[0]); |
| 310 | this.end = Integer.parseInt(offset[1]); |
| 311 | |
| 312 | } |
| 313 | catch (NumberFormatException e) { |
| 314 | }; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 315 | }; |
| 316 | }; |
| 317 | this.term = stringOffset[0]; |
| 318 | }; |
| 319 | |
| 320 | |
| 321 | /** |
| 322 | * Represent the MultiTerm as a string. |
| 323 | * Offsets are attached following a hash sign, |
| 324 | * payloads are attached following a dollar sign. |
| 325 | * All payloads are written as UTF-8 character sequences. |
| 326 | * |
| 327 | * @see #toStringShort(). |
| 328 | */ |
| 329 | public String toString () { |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 330 | |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 331 | StringBuilder sb = new StringBuilder(this.term); |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 332 | |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 333 | if (this.start != this.end) { |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 334 | sb.append('#') |
| 335 | .append(this.start) |
| 336 | .append('-') |
| 337 | .append(this.end); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 338 | }; |
| 339 | |
| 340 | if (this.payload != null) { |
| 341 | sb.append('$'); |
| 342 | try { |
| 343 | sb.append(this.payload.utf8ToString()); |
| 344 | } |
| 345 | catch (AssertionError e) { |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 346 | sb.append("<?>") |
| 347 | .append(this.payload.toString().replace(' ', ',')); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 348 | }; |
| 349 | }; |
| 350 | |
| 351 | return sb.toString(); |
| 352 | }; |
| 353 | |
| 354 | /** |
| 355 | * Represent the MultiTerm as a string. |
| 356 | * Payloads are attached following a dollar sign. |
| 357 | * All payloads are written as UTF-8 character sequences. |
| 358 | * Offsets are neglected. |
| 359 | * |
| 360 | * @see #toString(). |
| 361 | */ |
| 362 | public String toStringShort () { |
| 363 | StringBuilder sb = new StringBuilder(this.term); |
| 364 | if (this.payload != null) { |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 365 | sb.append('$'); |
| 366 | try { |
| 367 | sb.append(this.payload.utf8ToString()); |
| 368 | } |
| 369 | catch (AssertionError e) { |
| 370 | sb.append("<?>") |
| 371 | .append(this.payload.toString().replace(' ', ',')); |
| 372 | }; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 373 | }; |
| 374 | return sb.toString(); |
| 375 | }; |
| 376 | }; |