| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 1 | package de.ids_mannheim.korap.analysis; |
| 2 | |
| 3 | import static de.ids_mannheim.korap.util.KorapArray.*; |
| 4 | import org.apache.lucene.util.BytesRef; |
| 5 | import java.nio.ByteBuffer; |
| 6 | import java.util.*; |
| 7 | |
| 8 | |
| 9 | /** |
| 10 | * @author Nils Diewald |
| 11 | * @version 0.2 |
| 12 | * |
| 13 | * MultiTerm represents a term in a MultiTermToken. |
| 14 | */ |
| 15 | public class MultiTerm { |
| 16 | public int start, end = 0; |
| 17 | public String term = null; |
| 18 | public Integer posIncr = 1; |
| 19 | public boolean storeOffsets = false; |
| 20 | public BytesRef payload = null; |
| 21 | |
| 22 | /** |
| 23 | * The constructor. |
| 24 | * |
| 25 | * @param term The term surface. |
| 26 | Offsets can be written as an appended and dash separated pair of integers, |
| 27 | payloads can be written following a dollar sign. |
| 28 | payloads can be typed as being a short (s), an integer (i), or a long (l) |
| 29 | in leading angular brackets. All other payloads are treated as being UTF-8 |
| 30 | characer sequences. |
| 31 | |
| 32 | Examples: |
| 33 | MultiTerm test = new MultiTerm("test"); |
| 34 | MultiTerm test = new MultiTerm("test#0-4"); |
| 35 | MultiTerm test = new MultiTerm("test#0-4$Example"); |
| 36 | MultiTerm test = new MultiTerm("test#0-4$<i>1278"); |
| 37 | */ |
| 38 | public MultiTerm (String term) { |
| 39 | /* |
| 40 | this.start = this.end = 0; |
| 41 | this.storeOffsets = false; |
| 42 | this.payload = null; |
| 43 | */ |
| 44 | _fromString(term); |
| 45 | }; |
| 46 | |
| 47 | /** |
| 48 | * The constructor with a separated prefix. |
| 49 | * new MultiTerm('a', "bcd") is equivalent to |
| 50 | * new MultiTerm("a:bcd"); |
| 51 | * |
| 52 | * @param prefix A special prefix for the term. |
| 53 | * @param term The term surface. |
| 54 | * |
| 55 | * @see #MultiTerm(String) |
| 56 | */ |
| 57 | public MultiTerm (char prefix, String term) { |
| 58 | StringBuilder sb = new StringBuilder(); |
| 59 | /* |
| 60 | this.start = this.end = 0; |
| 61 | this.storeOffsets = false; |
| 62 | this.payload = null; |
| 63 | */ |
| 64 | sb.append(prefix).append(':').append(term); |
| 65 | _fromString(sb.toString()); |
| 66 | }; |
| 67 | |
| 68 | public void term (String term) { |
| 69 | this.term = term; |
| 70 | }; |
| 71 | |
| 72 | public String term () { |
| 73 | return this.term; |
| 74 | }; |
| 75 | |
| 76 | /** |
| 77 | * The constructor. |
| 78 | */ |
| 79 | public MultiTerm () { |
| 80 | this.term = ""; |
| 81 | /* |
| 82 | this.start = this.end = 0; |
| 83 | this.storeOffsets = false; |
| 84 | this.payload = null; |
| 85 | */ |
| 86 | }; |
| 87 | |
| 88 | public void payload (Byte pl) { |
| 89 | this.payload = new BytesRef( ByteBuffer.allocate(1).put(pl).array()); |
| 90 | }; |
| 91 | |
| 92 | public void payload (short pl) { |
| 93 | this.payload = new BytesRef( ByteBuffer.allocate(2).putShort(pl).array()); |
| 94 | }; |
| 95 | |
| 96 | public void payload (int pl) { |
| 97 | this.payload = new BytesRef( ByteBuffer.allocate(4).putInt(pl).array()); |
| 98 | }; |
| 99 | |
| 100 | public void payload (long pl) { |
| 101 | this.payload = new BytesRef( ByteBuffer.allocate(8).putLong(pl).array()); |
| 102 | }; |
| 103 | |
| 104 | public void payload (String pl) { |
| 105 | this.payload = new BytesRef(pl); |
| 106 | }; |
| 107 | |
| 108 | public void payload (byte[] pl) { |
| 109 | this.payload = new BytesRef(pl); |
| 110 | }; |
| 111 | |
| 112 | public void payload (BytesRef pl) { |
| 113 | this.payload = pl; |
| 114 | }; |
| 115 | |
| 116 | public BytesRef payload () { |
| 117 | return this.payload; |
| 118 | }; |
| 119 | |
| 120 | public void start (int value) { |
| 121 | this.start = value; |
| 122 | }; |
| 123 | |
| 124 | public int start () { |
| 125 | return this.start; |
| 126 | }; |
| 127 | |
| 128 | public void end (int value) { |
| 129 | this.end = value; |
| 130 | }; |
| 131 | |
| 132 | public int end () { |
| 133 | return this.end; |
| 134 | }; |
| 135 | |
| 136 | public boolean storeOffsets () { |
| 137 | return this.storeOffsets; |
| 138 | }; |
| 139 | |
| 140 | public void storeOffsets (boolean value) { |
| 141 | this.storeOffsets = value; |
| 142 | }; |
| 143 | |
| 144 | private void _fromString (String term) { |
| 145 | String[] termSurface = term.split("\\$", 2); |
| 146 | |
| 147 | // Payload is given |
| 148 | if (termSurface.length == 2) { |
| 149 | String payloadStr = termSurface[1]; |
| 150 | |
| 151 | // Payload has a type |
| 152 | if (payloadStr.charAt(0) == '<' && payloadStr.charAt(2) == '>') { |
| 153 | ByteBuffer bb = ByteBuffer.allocate(8); |
| 154 | |
| 155 | String[] pls = payloadStr.split("(?=<)|(?<=>)"); |
| 156 | int l = 0; |
| 157 | |
| 158 | for (int i = 1; i < pls.length;) { |
| 159 | |
| 160 | // Resize the buffer |
| 161 | if ((bb.capacity() - l) < 8) { |
| 162 | bb = ByteBuffer.allocate(bb.capacity() + 8).put(bb.array()); |
| 163 | bb.position(l); |
| 164 | }; |
| 165 | switch (pls[i]) { |
| 166 | case "<b>": // byte |
| 167 | bb.put(Byte.parseByte(pls[i+1])); |
| 168 | l++; |
| 169 | break; |
| 170 | case "<s>": |
| 171 | bb.putShort(Short.parseShort(pls[i+1])); |
| 172 | l+=2; |
| 173 | break; |
| 174 | case "<i>": |
| 175 | bb.putInt(Integer.parseInt(pls[i+1])); |
| 176 | l+=4; |
| 177 | break; |
| 178 | case "<l>": |
| 179 | bb.putLong(Long.parseLong(pls[i+1])); |
| 180 | l+=8; |
| 181 | break; |
| 182 | }; |
| 183 | i+=2; |
| 184 | }; |
| 185 | byte[] bytes = new byte[l]; |
| 186 | System.arraycopy(bb.array(), 0, bytes, 0, l); |
| 187 | this.payload = new BytesRef(bytes); |
| 188 | |
| 189 | |
| 190 | /* |
| 191 | payloadStr = payloadStr.substring(3, payloadStr.length()); |
| 192 | switch (type) { |
| 193 | case 'b': // byte |
| 194 | |
| 195 | System.err.println("bbb"); |
| 196 | payloadBytes = ByteBuffer.allocate(1).put(new Byte(payloadStr)).array(); |
| 197 | break; |
| 198 | case 's': // short |
| 199 | payloadBytes = ByteBuffer.allocate(2).putShort( |
| 200 | Short.parseShort(payloadStr) |
| 201 | ).array(); |
| 202 | break; |
| 203 | case 'i': // integer |
| 204 | payloadBytes = ByteBuffer.allocate(4).putInt( |
| 205 | Integer.parseInt(payloadStr) |
| 206 | ).array(); |
| 207 | break; |
| 208 | case 'l': // long |
| 209 | payloadBytes = ByteBuffer.allocate(8).putLong( |
| 210 | Long.parseLong(payloadStr) |
| 211 | ).array(); |
| 212 | break; |
| 213 | }; |
| 214 | TODO: |
| 215 | case '?': // arbitrary |
| 216 | payloadStr = |
| 217 | */ |
| 218 | } |
| 219 | |
| 220 | // Payload is a string |
| 221 | else { |
| 222 | this.payload = new BytesRef(payloadStr); |
| 223 | }; |
| 224 | }; |
| 225 | String[] stringOffset = termSurface[0].split("\\#", 2); |
| 226 | if (stringOffset.length == 2) { |
| 227 | String[] offset = stringOffset[1].split("\\-", 2); |
| 228 | |
| 229 | if (offset.length == 2 && offset[0].length() > 0) { |
| 230 | this.start = Integer.parseInt(offset[0]); |
| 231 | this.end = Integer.parseInt(offset[1]); |
| 232 | /* |
| 233 | } |
| 234 | else { |
| 235 | this.storeOffsets(false); |
| 236 | */ |
| 237 | }; |
| 238 | }; |
| 239 | this.term = stringOffset[0]; |
| 240 | }; |
| 241 | |
| 242 | |
| 243 | /** |
| 244 | * Represent the MultiTerm as a string. |
| 245 | * Offsets are attached following a hash sign, |
| 246 | * payloads are attached following a dollar sign. |
| 247 | * All payloads are written as UTF-8 character sequences. |
| 248 | * |
| 249 | * @see #toStringShort(). |
| 250 | */ |
| 251 | public String toString () { |
| 252 | StringBuilder sb = new StringBuilder(this.term); |
| 253 | if (this.start != this.end) { |
| 254 | sb.append('#').append(this.start).append('-').append(this.end); |
| 255 | /* |
| 256 | } |
| 257 | else if (!this.storeOffsets()) { |
| 258 | sb.append("#-"); |
| 259 | */ |
| 260 | }; |
| 261 | |
| 262 | if (this.payload != null) { |
| 263 | sb.append('$'); |
| 264 | try { |
| 265 | sb.append(this.payload.utf8ToString()); |
| 266 | } |
| 267 | catch (AssertionError e) { |
| 268 | sb.append("<?>").append(join(',', this.payload.toString().split(" "))); |
| 269 | }; |
| 270 | }; |
| 271 | |
| 272 | return sb.toString(); |
| 273 | }; |
| 274 | |
| 275 | /** |
| 276 | * Represent the MultiTerm as a string. |
| 277 | * Payloads are attached following a dollar sign. |
| 278 | * All payloads are written as UTF-8 character sequences. |
| 279 | * Offsets are neglected. |
| 280 | * |
| 281 | * @see #toString(). |
| 282 | */ |
| 283 | public String toStringShort () { |
| 284 | StringBuilder sb = new StringBuilder(this.term); |
| 285 | if (this.payload != null) { |
| 286 | sb.append('$').append(this.payload.utf8ToString()); |
| 287 | }; |
| 288 | return sb.toString(); |
| 289 | }; |
| 290 | }; |