blob: 9c9e306719b40709248f13c8e5bb64a5e08c3c8e [file] [log] [blame]
Nils Diewaldf399a672013-11-18 17:55:22 +00001package de.ids_mannheim.korap.analysis;
2
3import static de.ids_mannheim.korap.util.KorapArray.*;
4import org.apache.lucene.util.BytesRef;
5import java.nio.ByteBuffer;
6import java.util.*;
7
8
9/**
10 * @author Nils Diewald
11 * @version 0.2
12 *
13 * MultiTerm represents a term in a MultiTermToken.
14 */
15public class MultiTerm {
16 public int start, end = 0;
17 public String term = null;
18 public Integer posIncr = 1;
19 public boolean storeOffsets = false;
20 public BytesRef payload = null;
21
22 /**
23 * The constructor.
24 *
25 * @param term The term surface.
26 Offsets can be written as an appended and dash separated pair of integers,
27 payloads can be written following a dollar sign.
28 payloads can be typed as being a short (s), an integer (i), or a long (l)
29 in leading angular brackets. All other payloads are treated as being UTF-8
30 characer sequences.
31
32 Examples:
33 MultiTerm test = new MultiTerm("test");
34 MultiTerm test = new MultiTerm("test#0-4");
35 MultiTerm test = new MultiTerm("test#0-4$Example");
36 MultiTerm test = new MultiTerm("test#0-4$<i>1278");
37 */
38 public MultiTerm (String term) {
39 /*
40 this.start = this.end = 0;
41 this.storeOffsets = false;
42 this.payload = null;
43 */
44 _fromString(term);
45 };
46
47 /**
48 * The constructor with a separated prefix.
49 * new MultiTerm('a', "bcd") is equivalent to
50 * new MultiTerm("a:bcd");
51 *
52 * @param prefix A special prefix for the term.
53 * @param term The term surface.
54 *
55 * @see #MultiTerm(String)
56 */
57 public MultiTerm (char prefix, String term) {
58 StringBuilder sb = new StringBuilder();
59 /*
60 this.start = this.end = 0;
61 this.storeOffsets = false;
62 this.payload = null;
63 */
64 sb.append(prefix).append(':').append(term);
65 _fromString(sb.toString());
66 };
67
68 public void term (String term) {
69 this.term = term;
70 };
71
72 public String term () {
73 return this.term;
74 };
75
76 /**
77 * The constructor.
78 */
79 public MultiTerm () {
80 this.term = "";
81 /*
82 this.start = this.end = 0;
83 this.storeOffsets = false;
84 this.payload = null;
85 */
86 };
87
88 public void payload (Byte pl) {
89 this.payload = new BytesRef( ByteBuffer.allocate(1).put(pl).array());
90 };
91
92 public void payload (short pl) {
93 this.payload = new BytesRef( ByteBuffer.allocate(2).putShort(pl).array());
94 };
95
96 public void payload (int pl) {
97 this.payload = new BytesRef( ByteBuffer.allocate(4).putInt(pl).array());
98 };
99
100 public void payload (long pl) {
101 this.payload = new BytesRef( ByteBuffer.allocate(8).putLong(pl).array());
102 };
103
104 public void payload (String pl) {
105 this.payload = new BytesRef(pl);
106 };
107
108 public void payload (byte[] pl) {
109 this.payload = new BytesRef(pl);
110 };
111
112 public void payload (BytesRef pl) {
113 this.payload = pl;
114 };
115
116 public BytesRef payload () {
117 return this.payload;
118 };
119
120 public void start (int value) {
121 this.start = value;
122 };
123
124 public int start () {
125 return this.start;
126 };
127
128 public void end (int value) {
129 this.end = value;
130 };
131
132 public int end () {
133 return this.end;
134 };
135
136 public boolean storeOffsets () {
137 return this.storeOffsets;
138 };
139
140 public void storeOffsets (boolean value) {
141 this.storeOffsets = value;
142 };
143
144 private void _fromString (String term) {
145 String[] termSurface = term.split("\\$", 2);
146
147 // Payload is given
148 if (termSurface.length == 2) {
149 String payloadStr = termSurface[1];
150
151 // Payload has a type
152 if (payloadStr.charAt(0) == '<' && payloadStr.charAt(2) == '>') {
153 ByteBuffer bb = ByteBuffer.allocate(8);
154
155 String[] pls = payloadStr.split("(?=<)|(?<=>)");
156 int l = 0;
157
158 for (int i = 1; i < pls.length;) {
159
160 // Resize the buffer
161 if ((bb.capacity() - l) < 8) {
162 bb = ByteBuffer.allocate(bb.capacity() + 8).put(bb.array());
163 bb.position(l);
164 };
165 switch (pls[i]) {
166 case "<b>": // byte
167 bb.put(Byte.parseByte(pls[i+1]));
168 l++;
169 break;
170 case "<s>":
171 bb.putShort(Short.parseShort(pls[i+1]));
172 l+=2;
173 break;
174 case "<i>":
175 bb.putInt(Integer.parseInt(pls[i+1]));
176 l+=4;
177 break;
178 case "<l>":
179 bb.putLong(Long.parseLong(pls[i+1]));
180 l+=8;
181 break;
182 };
183 i+=2;
184 };
185 byte[] bytes = new byte[l];
186 System.arraycopy(bb.array(), 0, bytes, 0, l);
187 this.payload = new BytesRef(bytes);
188
189
190 /*
191 payloadStr = payloadStr.substring(3, payloadStr.length());
192 switch (type) {
193 case 'b': // byte
194
195 System.err.println("bbb");
196 payloadBytes = ByteBuffer.allocate(1).put(new Byte(payloadStr)).array();
197 break;
198 case 's': // short
199 payloadBytes = ByteBuffer.allocate(2).putShort(
200 Short.parseShort(payloadStr)
201 ).array();
202 break;
203 case 'i': // integer
204 payloadBytes = ByteBuffer.allocate(4).putInt(
205 Integer.parseInt(payloadStr)
206 ).array();
207 break;
208 case 'l': // long
209 payloadBytes = ByteBuffer.allocate(8).putLong(
210 Long.parseLong(payloadStr)
211 ).array();
212 break;
213 };
214 TODO:
215 case '?': // arbitrary
216 payloadStr =
217 */
218 }
219
220 // Payload is a string
221 else {
222 this.payload = new BytesRef(payloadStr);
223 };
224 };
225 String[] stringOffset = termSurface[0].split("\\#", 2);
226 if (stringOffset.length == 2) {
227 String[] offset = stringOffset[1].split("\\-", 2);
228
229 if (offset.length == 2 && offset[0].length() > 0) {
230 this.start = Integer.parseInt(offset[0]);
231 this.end = Integer.parseInt(offset[1]);
232 /*
233 }
234 else {
235 this.storeOffsets(false);
236 */
237 };
238 };
239 this.term = stringOffset[0];
240 };
241
242
243 /**
244 * Represent the MultiTerm as a string.
245 * Offsets are attached following a hash sign,
246 * payloads are attached following a dollar sign.
247 * All payloads are written as UTF-8 character sequences.
248 *
249 * @see #toStringShort().
250 */
251 public String toString () {
252 StringBuilder sb = new StringBuilder(this.term);
253 if (this.start != this.end) {
254 sb.append('#').append(this.start).append('-').append(this.end);
255 /*
256 }
257 else if (!this.storeOffsets()) {
258 sb.append("#-");
259 */
260 };
261
262 if (this.payload != null) {
263 sb.append('$');
264 try {
265 sb.append(this.payload.utf8ToString());
266 }
267 catch (AssertionError e) {
268 sb.append("<?>").append(join(',', this.payload.toString().split(" ")));
269 };
270 };
271
272 return sb.toString();
273 };
274
275 /**
276 * Represent the MultiTerm as a string.
277 * Payloads are attached following a dollar sign.
278 * All payloads are written as UTF-8 character sequences.
279 * Offsets are neglected.
280 *
281 * @see #toString().
282 */
283 public String toStringShort () {
284 StringBuilder sb = new StringBuilder(this.term);
285 if (this.payload != null) {
286 sb.append('$').append(this.payload.utf8ToString());
287 };
288 return sb.toString();
289 };
290};