blob: 60b177d955ac8ebdd025cf486c1e66559a591cba [file] [log] [blame]
Nils Diewald2cd1c3d2014-01-08 22:53:08 +00001package de.ids_mannheim.korap.index;
2
3import java.util.*;
Nils Diewald138e5b92014-01-10 21:15:13 +00004import java.nio.ByteBuffer;
Nils Diewaldcde69082014-01-16 15:46:48 +00005import java.lang.StringBuffer;
Nils Diewald138e5b92014-01-10 21:15:13 +00006import java.util.regex.*;
Nils Diewald392bcf32015-02-26 20:01:17 +00007import de.ids_mannheim.korap.response.Match;
Nils Diewald2cd1c3d2014-01-08 22:53:08 +00008
Nils Diewald138e5b92014-01-10 21:15:13 +00009import org.slf4j.Logger;
10import org.slf4j.LoggerFactory;
Nils Diewald2cd1c3d2014-01-08 22:53:08 +000011
Nils Diewald138e5b92014-01-10 21:15:13 +000012public class TermInfo implements Comparable<TermInfo> {
13
14 // Logger
Nils Diewald392bcf32015-02-26 20:01:17 +000015 private final static Logger log = LoggerFactory.getLogger(Match.class);
Nils Diewald82a4b862014-02-20 21:17:41 +000016 // This advices the java compiler to ignore all loggings
17 public static final boolean DEBUG = false;
18
Akron6d2c4692016-02-03 18:29:10 +010019 // TODO: Support various terms - including relations!
20
Nils Diewaldcde69082014-01-16 15:46:48 +000021 private String foundry, layer, value, term, type, annotation;
Nils Diewald138e5b92014-01-10 21:15:13 +000022 // type can be "term", "pos", "span", "rel-src", "rel-target"
23
Nils Diewald2cd1c3d2014-01-08 22:53:08 +000024 private int pos = 0;
Nils Diewald138e5b92014-01-10 21:15:13 +000025 private ByteBuffer payload;
26 private boolean analyzed = false;
Nils Diewald2cd1c3d2014-01-08 22:53:08 +000027
Akrona82cee22017-09-18 14:52:12 +020028 private int
29 startChar = -1, // character offset for start of span
30 endChar = -1, // character offset for end of span
31 startPos = -1, // start position of source
32 endPos = -1, // end position of source
33 targetStartPos = -1, // start position of target
34 targetEndPos = -1; // end position of target
Nils Diewald2cd1c3d2014-01-08 22:53:08 +000035
Nils Diewald138e5b92014-01-10 21:15:13 +000036 private byte depth = (byte) 0;
Nils Diewaldbb33da22015-03-04 16:24:25 +000037
38 private Pattern prefixRegex = Pattern
Akron42993552016-02-04 13:24:24 +010039 .compile("(?:([^/]+)/)?([^:/]+)(?::(.+?))?");
Nils Diewald138e5b92014-01-10 21:15:13 +000040 private Matcher matcher;
41
Nils Diewaldbb33da22015-03-04 16:24:25 +000042
Nils Diewald138e5b92014-01-10 21:15:13 +000043 public TermInfo (String term, int pos, ByteBuffer payload) {
Nils Diewaldbb33da22015-03-04 16:24:25 +000044 this.term = term;
Nils Diewald392bcf32015-02-26 20:01:17 +000045 this.startPos = pos;
Nils Diewaldbb33da22015-03-04 16:24:25 +000046 this.endPos = pos;
47 this.payload = payload;
Nils Diewald138e5b92014-01-10 21:15:13 +000048 };
49
Akron13db6152016-02-19 14:08:38 +010050
Nils Diewald138e5b92014-01-10 21:15:13 +000051 public TermInfo analyze () {
Nils Diewald392bcf32015-02-26 20:01:17 +000052 if (analyzed)
53 return this;
Nils Diewald138e5b92014-01-10 21:15:13 +000054
Nils Diewald392bcf32015-02-26 20:01:17 +000055 int ttype = 0;
56 String tterm = this.term;
57 int lastPos = this.payload.position();
58 this.payload.rewind();
Nils Diewald138e5b92014-01-10 21:15:13 +000059
Akron6d2c4692016-02-03 18:29:10 +010060 // TODO: Use PTI!
Akronb35261a2016-02-10 20:24:24 +010061 // Add TUI and REF!
Nils Diewald392bcf32015-02-26 20:01:17 +000062 switch (tterm.charAt(0)) {
Akron42993552016-02-04 13:24:24 +010063 case '<':
64 // "<>:mate/l:..."
65 if (tterm.charAt(1) == '>') {
66 // span
67 this.type = "span";
68 tterm = tterm.substring(3);
69 ttype = 2;
70 }
71 // rel-target
72 else {
73 this.type = "relTarget";
74 tterm = tterm.substring(2);
75 ttype = 3;
Eliza Margaretha6f989202016-10-14 21:48:29 +020076 };
Akron42993552016-02-04 13:24:24 +010077 break;
78
79 case '>':
80 // rel-src
81 this.type = "relSrc";
Nils Diewald392bcf32015-02-26 20:01:17 +000082 tterm = tterm.substring(2);
83 ttype = 3;
Akron42993552016-02-04 13:24:24 +010084 break;
Nils Diewald138e5b92014-01-10 21:15:13 +000085
Akron13db6152016-02-19 14:08:38 +010086 case '_':
87 // pos
88 this.type = "pos";
89 ttype = 1;
90 tterm = tterm.substring(1);
91 break;
Akronb35261a2016-02-10 20:24:24 +010092
Akron13db6152016-02-19 14:08:38 +010093 case '@':
94 // pos
95 this.type = "attr";
96 ttype = 4;
97 tterm = tterm.substring(1);
98 break;
Nils Diewald138e5b92014-01-10 21:15:13 +000099
Akron42993552016-02-04 13:24:24 +0100100 default:
101 // term
102 this.type = "term";
Nils Diewald392bcf32015-02-26 20:01:17 +0000103 };
Nils Diewald138e5b92014-01-10 21:15:13 +0000104
Akrona82cee22017-09-18 14:52:12 +0200105 int pti = 0;
106
Nils Diewald392bcf32015-02-26 20:01:17 +0000107 // Analyze term value
108 if (ttype != 1) {
Akron5f044032015-12-18 00:35:38 +0100109
Akrona82cee22017-09-18 14:52:12 +0200110 pti = this.payload.get(); // Ignore PTI - temporary!!!
Akron5f044032015-12-18 00:35:38 +0100111
Akrona82cee22017-09-18 14:52:12 +0200112 if (DEBUG) {
113 log.trace(
114 "Check {} with {} for {}",
115 tterm,
116 pti,
117 prefixRegex.toString()
118 );
119 };
120
Nils Diewald392bcf32015-02-26 20:01:17 +0000121 matcher = prefixRegex.matcher(tterm);
Akrona82cee22017-09-18 14:52:12 +0200122
123 if (matcher.matches() && matcher.groupCount() == 3) {
Nils Diewald392bcf32015-02-26 20:01:17 +0000124 this.annotation = tterm;
125 if (matcher.group(1) != null)
126 this.foundry = matcher.group(1);
127 else
128 this.foundry = "base";
Nils Diewaldbb33da22015-03-04 16:24:25 +0000129 this.layer = matcher.group(2);
130 this.value = matcher.group(3);
Nils Diewald392bcf32015-02-26 20:01:17 +0000131 };
132 }
Nils Diewald138e5b92014-01-10 21:15:13 +0000133
Akrona82cee22017-09-18 14:52:12 +0200134 // for positions (aka offset tokens)
Nils Diewald392bcf32015-02-26 20:01:17 +0000135 else {
136 this.value = tterm;
137 this.startChar = this.payload.getInt();
Nils Diewaldbb33da22015-03-04 16:24:25 +0000138 this.endChar = this.payload.getInt();
Nils Diewald392bcf32015-02-26 20:01:17 +0000139 };
Nils Diewaldbb33da22015-03-04 16:24:25 +0000140
Nils Diewald392bcf32015-02-26 20:01:17 +0000141 // for spans
142 if (ttype == 2) {
143 this.startChar = this.payload.getInt();
Nils Diewaldbb33da22015-03-04 16:24:25 +0000144 this.endChar = this.payload.getInt();
Akron9ebdfab2018-02-19 16:38:17 +0100145 if (this.startChar == this.endChar)
146 this.type = "empty";
Nils Diewald392bcf32015-02-26 20:01:17 +0000147 };
Nils Diewald138e5b92014-01-10 21:15:13 +0000148
Akronb35261a2016-02-10 20:24:24 +0100149 // for spans, relations and attributes
150 if (ttype > 1 && ttype != 4) {
Akrona82cee22017-09-18 14:52:12 +0200151
152 // relSrc
Akron6d2c4692016-02-03 18:29:10 +0100153 if (this.type.equals("relTarget")) {
154 this.endPos = this.startPos;
155 this.startPos = this.payload.getInt() - 1;
156 }
Akrona82cee22017-09-18 14:52:12 +0200157
158 // Token-to-token relation
159 else if (pti == 32) {
160 /*
161 * 1 byte for PTI (32),
162 * 1 integer for the right part token position,
163 * 1 short for the left-part TUI,
164 * 1 short for right-part TUI and
165 * 1 short for the relation TUI.
166 */
Akron430703a2017-11-16 18:32:54 +0100167 this.targetStartPos = this.payload.getInt();
Akrona82cee22017-09-18 14:52:12 +0200168 }
169
170 // Token-to-span relation
171 else if (pti == 33) {
172 /*
173 * 1 byte for PTI (33),
174 * 1 integer for the start span offset of the right part,
175 * 1 integer for the end span offset of the right part,
176 * 1 integer for the start position of the right part,
177 * 1 integer for the end position of the right part,
178 * and 0-3 TUIs as above.
179 */
180 // Ignore offsets
181 this.payload.getInt();
182 this.payload.getInt();
183
184 this.endPos = this.startPos;
185 this.targetStartPos = this.payload.getInt();
186 this.targetEndPos = this.payload.getInt();
187 }
Akron652e4362017-09-18 20:14:44 +0200188
189 // Span to token
Akrona82cee22017-09-18 14:52:12 +0200190 else if (pti == 34) {
191 /*
192 * 1 byte for PTI (34),
193 * 1 integer for the start span offset of the left part,
194 * 1 integer for the end span offset of the left part,
195 * 1 integer for end position of the left part,
196 * 1 integer for end position of the right part, and
197 * and 0-3 TUIs as above.
198 */
Akron652e4362017-09-18 20:14:44 +0200199
200 // Ignore offsets
201 this.payload.getInt();
202 this.endPos = this.payload.getInt();
203 this.targetStartPos = this.payload.getInt();
Akrona82cee22017-09-18 14:52:12 +0200204 }
205 else if (pti == 35) {
206 /*
207 * 1 byte for PTI (35),
208 * 1 integer for the start span offset of the left part,
209 * 1 integer for the end span offset of the left part,
210 * 1 integer for the start span offset of the right part,
211 * 1 integer for the end span offset of the right part,
212 * 1 integer for end position of the left part,
213 * 1 integer for the start position of the right part,
214 * 1 integer for end position of the right part,
215 * and 0-3 TUIs as above.
216 */
Akron652e4362017-09-18 20:14:44 +0200217
218 // Ignore offsets
219 this.payload.getInt();
220 this.payload.getInt();
221 this.payload.getInt();
222 this.payload.getInt();
223
224 this.endPos = this.payload.getInt();
225 this.targetStartPos = this.payload.getInt();
226 this.targetEndPos = this.payload.getInt();
Akrona82cee22017-09-18 14:52:12 +0200227 }
Akron6d2c4692016-02-03 18:29:10 +0100228 else {
229 this.endPos = this.payload.getInt() - 1;
230 };
Akron5f044032015-12-18 00:35:38 +0100231 };
232
233 // Ignore link id for the moment
Nils Diewald392bcf32015-02-26 20:01:17 +0000234 if (ttype == 2 && this.payload.position() < lastPos) {
235 this.depth = this.payload.get();
236 };
Nils Diewaldbb33da22015-03-04 16:24:25 +0000237
Akrona82cee22017-09-18 14:52:12 +0200238 /*
239 * TODO:
240 * Analyze TUI for attributes
241 */
242
Nils Diewald392bcf32015-02-26 20:01:17 +0000243 // payloads can have different meaning
244 analyzed = true;
245 return this;
Nils Diewald138e5b92014-01-10 21:15:13 +0000246 };
247
Nils Diewaldbb33da22015-03-04 16:24:25 +0000248
Nils Diewald138e5b92014-01-10 21:15:13 +0000249 public String getType () {
Nils Diewald392bcf32015-02-26 20:01:17 +0000250 return this.type;
Nils Diewald138e5b92014-01-10 21:15:13 +0000251 };
252
Nils Diewaldbb33da22015-03-04 16:24:25 +0000253
Nils Diewald138e5b92014-01-10 21:15:13 +0000254 public int getStartChar () {
Nils Diewald392bcf32015-02-26 20:01:17 +0000255 return this.startChar;
Nils Diewald138e5b92014-01-10 21:15:13 +0000256 };
257
Nils Diewaldbb33da22015-03-04 16:24:25 +0000258
Nils Diewald138e5b92014-01-10 21:15:13 +0000259 public void setStartChar (int pos) {
260 this.startChar = pos;
261 };
262
Nils Diewaldbb33da22015-03-04 16:24:25 +0000263
Nils Diewald138e5b92014-01-10 21:15:13 +0000264 public int getEndChar () {
Nils Diewald392bcf32015-02-26 20:01:17 +0000265 return this.endChar;
Nils Diewald138e5b92014-01-10 21:15:13 +0000266 };
267
Nils Diewaldbb33da22015-03-04 16:24:25 +0000268
Nils Diewald138e5b92014-01-10 21:15:13 +0000269 public void setEndChar (int pos) {
270 this.endChar = pos;
271 };
272
Nils Diewaldbb33da22015-03-04 16:24:25 +0000273
Nils Diewald138e5b92014-01-10 21:15:13 +0000274 public int getStartPos () {
Nils Diewald392bcf32015-02-26 20:01:17 +0000275 return this.startPos;
Nils Diewald138e5b92014-01-10 21:15:13 +0000276 };
277
Nils Diewaldbb33da22015-03-04 16:24:25 +0000278
Nils Diewald138e5b92014-01-10 21:15:13 +0000279 public int getEndPos () {
Nils Diewald392bcf32015-02-26 20:01:17 +0000280 return this.endPos;
Nils Diewald138e5b92014-01-10 21:15:13 +0000281 };
282
Akrona82cee22017-09-18 14:52:12 +0200283 public int getTargetStartPos () {
284 return this.targetStartPos;
285 };
286
287
288 public int getTargetEndPos () {
289 return this.targetEndPos;
290 };
291
Nils Diewaldbb33da22015-03-04 16:24:25 +0000292
Nils Diewald138e5b92014-01-10 21:15:13 +0000293 public byte getDepth () {
Nils Diewald392bcf32015-02-26 20:01:17 +0000294 return this.depth;
Nils Diewald138e5b92014-01-10 21:15:13 +0000295 };
296
Nils Diewaldbb33da22015-03-04 16:24:25 +0000297
Nils Diewald138e5b92014-01-10 21:15:13 +0000298 public String getFoundry () {
Nils Diewald392bcf32015-02-26 20:01:17 +0000299 return this.foundry;
Nils Diewald138e5b92014-01-10 21:15:13 +0000300 };
301
Nils Diewaldbb33da22015-03-04 16:24:25 +0000302
Nils Diewald138e5b92014-01-10 21:15:13 +0000303 public String getLayer () {
Nils Diewald392bcf32015-02-26 20:01:17 +0000304 return this.layer;
Nils Diewald138e5b92014-01-10 21:15:13 +0000305 };
306
Nils Diewaldbb33da22015-03-04 16:24:25 +0000307
Nils Diewald138e5b92014-01-10 21:15:13 +0000308 public String getValue () {
Nils Diewald392bcf32015-02-26 20:01:17 +0000309 return this.value;
Nils Diewald138e5b92014-01-10 21:15:13 +0000310 };
311
Nils Diewaldbb33da22015-03-04 16:24:25 +0000312
Nils Diewaldcde69082014-01-16 15:46:48 +0000313 public String getAnnotation () {
Nils Diewald392bcf32015-02-26 20:01:17 +0000314 return this.annotation;
Nils Diewaldcde69082014-01-16 15:46:48 +0000315 };
316
Nils Diewaldbb33da22015-03-04 16:24:25 +0000317
Nils Diewald345bdc02014-01-21 21:48:57 +0000318 public String toString () {
Nils Diewald392bcf32015-02-26 20:01:17 +0000319 this.analyze();
Nils Diewald345bdc02014-01-21 21:48:57 +0000320
Nils Diewald392bcf32015-02-26 20:01:17 +0000321 StringBuffer sb = new StringBuffer();
322 sb.append('<').append(this.getType()).append('>');
323 sb.append(this.getFoundry()).append('/').append(this.getLayer());
Nils Diewald345bdc02014-01-21 21:48:57 +0000324
Nils Diewald392bcf32015-02-26 20:01:17 +0000325 if (this.getValue() != null)
326 sb.append(':').append(this.getValue());
Nils Diewald345bdc02014-01-21 21:48:57 +0000327
Nils Diewald392bcf32015-02-26 20:01:17 +0000328 if (this.getDepth() != (byte) 0)
329 sb.append('(').append(this.getDepth()).append(')');
Nils Diewald345bdc02014-01-21 21:48:57 +0000330
Nils Diewald392bcf32015-02-26 20:01:17 +0000331 sb.append('[').append(this.getStartPos());
332 sb.append('-').append(this.getEndPos()).append(']');
333 sb.append('[').append(this.getStartChar());
334 sb.append('-').append(this.getEndChar()).append(']');
Nils Diewald345bdc02014-01-21 21:48:57 +0000335
Nils Diewald392bcf32015-02-26 20:01:17 +0000336 return sb.toString();
Nils Diewald345bdc02014-01-21 21:48:57 +0000337 };
338
Nils Diewaldbb33da22015-03-04 16:24:25 +0000339
Nils Diewald138e5b92014-01-10 21:15:13 +0000340 @Override
Akron42993552016-02-04 13:24:24 +0100341 public int compareTo (TermInfo obj) {
Nils Diewald392bcf32015-02-26 20:01:17 +0000342 this.analyze();
343 obj.analyze();
Nils Diewald138e5b92014-01-10 21:15:13 +0000344
Nils Diewald392bcf32015-02-26 20:01:17 +0000345 // TODO: This sorting does not seem to work!
346 // although it might only be important for depth stuff.
Nils Diewaldcde69082014-01-16 15:46:48 +0000347
Nils Diewald392bcf32015-02-26 20:01:17 +0000348 if (this.startChar < obj.startChar) {
349 return -1;
350 }
351 else if (this.startChar > obj.startChar) {
352 return 1;
353 }
354 else if (this.depth < obj.depth) {
355 return 1;
356 }
357 else if (this.depth > obj.depth) {
358 return -1;
359 };
360 return 0;
Nils Diewald2cd1c3d2014-01-08 22:53:08 +0000361 };
362};