blob: 6354f822a4ac5a00a950b31c98d61c38a15d1ef0 [file] [log] [blame]
Nils Diewald2cd1c3d2014-01-08 22:53:08 +00001package de.ids_mannheim.korap.index;
2
3import java.util.*;
Nils Diewald138e5b92014-01-10 21:15:13 +00004import java.nio.ByteBuffer;
Nils Diewaldcde69082014-01-16 15:46:48 +00005import java.lang.StringBuffer;
Nils Diewald138e5b92014-01-10 21:15:13 +00006import java.util.regex.*;
Nils Diewald392bcf32015-02-26 20:01:17 +00007import de.ids_mannheim.korap.response.Match;
Nils Diewald2cd1c3d2014-01-08 22:53:08 +00008
Nils Diewald138e5b92014-01-10 21:15:13 +00009import org.slf4j.Logger;
10import org.slf4j.LoggerFactory;
Nils Diewald2cd1c3d2014-01-08 22:53:08 +000011
Nils Diewald138e5b92014-01-10 21:15:13 +000012public class TermInfo implements Comparable<TermInfo> {
13
14 // Logger
Nils Diewald392bcf32015-02-26 20:01:17 +000015 private final static Logger log = LoggerFactory.getLogger(Match.class);
Nils Diewald82a4b862014-02-20 21:17:41 +000016 // This advices the java compiler to ignore all loggings
17 public static final boolean DEBUG = false;
18
Nils Diewald138e5b92014-01-10 21:15:13 +000019
Nils Diewaldcde69082014-01-16 15:46:48 +000020 private String foundry, layer, value, term, type, annotation;
Nils Diewald138e5b92014-01-10 21:15:13 +000021 // type can be "term", "pos", "span", "rel-src", "rel-target"
22
Nils Diewald2cd1c3d2014-01-08 22:53:08 +000023 private int pos = 0;
Nils Diewald138e5b92014-01-10 21:15:13 +000024 private ByteBuffer payload;
25 private boolean analyzed = false;
Nils Diewald2cd1c3d2014-01-08 22:53:08 +000026
Nils Diewaldbb33da22015-03-04 16:24:25 +000027 private int startChar = -1, endChar = -1, startPos = -1, endPos = -1;
Nils Diewald2cd1c3d2014-01-08 22:53:08 +000028
Nils Diewald138e5b92014-01-10 21:15:13 +000029 private byte depth = (byte) 0;
Nils Diewaldbb33da22015-03-04 16:24:25 +000030
31 private Pattern prefixRegex = Pattern
32 .compile("(?:([^/]+)/)?([^:/]+)(?::(.+?))?");
Nils Diewald138e5b92014-01-10 21:15:13 +000033 private Matcher matcher;
34
Nils Diewaldbb33da22015-03-04 16:24:25 +000035
Nils Diewald138e5b92014-01-10 21:15:13 +000036 public TermInfo (String term, int pos, ByteBuffer payload) {
Nils Diewaldbb33da22015-03-04 16:24:25 +000037 this.term = term;
Nils Diewald392bcf32015-02-26 20:01:17 +000038 this.startPos = pos;
Nils Diewaldbb33da22015-03-04 16:24:25 +000039 this.endPos = pos;
40 this.payload = payload;
Nils Diewald138e5b92014-01-10 21:15:13 +000041 };
42
Nils Diewaldbb33da22015-03-04 16:24:25 +000043
Nils Diewald138e5b92014-01-10 21:15:13 +000044 public TermInfo analyze () {
Nils Diewald392bcf32015-02-26 20:01:17 +000045 if (analyzed)
46 return this;
Nils Diewald138e5b92014-01-10 21:15:13 +000047
Nils Diewald392bcf32015-02-26 20:01:17 +000048 int ttype = 0;
49 String tterm = this.term;
50 int lastPos = this.payload.position();
51 this.payload.rewind();
Nils Diewald138e5b92014-01-10 21:15:13 +000052
Nils Diewald392bcf32015-02-26 20:01:17 +000053 switch (tterm.charAt(0)) {
Nils Diewaldbb33da22015-03-04 16:24:25 +000054 case '<':
55 // "<>:mate/l:..."
56 if (tterm.charAt(1) == '>') {
57 // span
58 this.type = "span";
59 tterm = tterm.substring(3);
60 ttype = 2;
61 }
62 // rel-target
63 else {
64 this.type = "relTarget";
65 tterm = tterm.substring(2);
66 ttype = 3;
67 }
68 ;
69 break;
70
71 case '>':
72 // rel-src
73 this.type = "relSrc";
Nils Diewald392bcf32015-02-26 20:01:17 +000074 tterm = tterm.substring(2);
75 ttype = 3;
Nils Diewaldbb33da22015-03-04 16:24:25 +000076 break;
Nils Diewald138e5b92014-01-10 21:15:13 +000077
Nils Diewaldbb33da22015-03-04 16:24:25 +000078 case '_':
79 // pos
80 this.type = "pos";
81 ttype = 1;
82 tterm = tterm.substring(1);
83 break;
Nils Diewald138e5b92014-01-10 21:15:13 +000084
Nils Diewaldbb33da22015-03-04 16:24:25 +000085 default:
86 // term
87 this.type = "term";
Nils Diewald392bcf32015-02-26 20:01:17 +000088 };
Nils Diewald138e5b92014-01-10 21:15:13 +000089
Nils Diewald392bcf32015-02-26 20:01:17 +000090 // Analyze term value
91 if (ttype != 1) {
92 if (DEBUG)
93 log.trace("Check {} for {}", tterm, prefixRegex.toString());
94 matcher = prefixRegex.matcher(tterm);
95 if (matcher.matches() && matcher.groupCount() == 3) {
96 this.annotation = tterm;
97 if (matcher.group(1) != null)
98 this.foundry = matcher.group(1);
99 else
100 this.foundry = "base";
Nils Diewaldbb33da22015-03-04 16:24:25 +0000101 this.layer = matcher.group(2);
102 this.value = matcher.group(3);
Nils Diewald392bcf32015-02-26 20:01:17 +0000103 };
104 }
Nils Diewald138e5b92014-01-10 21:15:13 +0000105
Nils Diewald392bcf32015-02-26 20:01:17 +0000106 // for positions
107 else {
108 this.value = tterm;
109 this.startChar = this.payload.getInt();
Nils Diewaldbb33da22015-03-04 16:24:25 +0000110 this.endChar = this.payload.getInt();
Nils Diewald392bcf32015-02-26 20:01:17 +0000111 };
Nils Diewaldbb33da22015-03-04 16:24:25 +0000112
Nils Diewald392bcf32015-02-26 20:01:17 +0000113 // for spans
114 if (ttype == 2) {
115 this.startChar = this.payload.getInt();
Nils Diewaldbb33da22015-03-04 16:24:25 +0000116 this.endChar = this.payload.getInt();
Nils Diewald392bcf32015-02-26 20:01:17 +0000117 };
Nils Diewald138e5b92014-01-10 21:15:13 +0000118
Nils Diewald392bcf32015-02-26 20:01:17 +0000119 // for spans and relations
120 if (ttype > 1)
121 // Unsure if this is correct
Nils Diewaldbb33da22015-03-04 16:24:25 +0000122 this.endPos = this.payload.getInt() - 1;
123
Nils Diewald392bcf32015-02-26 20:01:17 +0000124 if (ttype == 2 && this.payload.position() < lastPos) {
125 this.depth = this.payload.get();
126 };
Nils Diewaldbb33da22015-03-04 16:24:25 +0000127
Nils Diewald392bcf32015-02-26 20:01:17 +0000128 // payloads can have different meaning
129 analyzed = true;
130 return this;
Nils Diewald138e5b92014-01-10 21:15:13 +0000131 };
132
Nils Diewaldbb33da22015-03-04 16:24:25 +0000133
Nils Diewald138e5b92014-01-10 21:15:13 +0000134 public String getType () {
Nils Diewald392bcf32015-02-26 20:01:17 +0000135 return this.type;
Nils Diewald138e5b92014-01-10 21:15:13 +0000136 };
137
Nils Diewaldbb33da22015-03-04 16:24:25 +0000138
Nils Diewald138e5b92014-01-10 21:15:13 +0000139 public int getStartChar () {
Nils Diewald392bcf32015-02-26 20:01:17 +0000140 return this.startChar;
Nils Diewald138e5b92014-01-10 21:15:13 +0000141 };
142
Nils Diewaldbb33da22015-03-04 16:24:25 +0000143
Nils Diewald138e5b92014-01-10 21:15:13 +0000144 public void setStartChar (int pos) {
145 this.startChar = pos;
146 };
147
Nils Diewaldbb33da22015-03-04 16:24:25 +0000148
Nils Diewald138e5b92014-01-10 21:15:13 +0000149 public int getEndChar () {
Nils Diewald392bcf32015-02-26 20:01:17 +0000150 return this.endChar;
Nils Diewald138e5b92014-01-10 21:15:13 +0000151 };
152
Nils Diewaldbb33da22015-03-04 16:24:25 +0000153
Nils Diewald138e5b92014-01-10 21:15:13 +0000154 public void setEndChar (int pos) {
155 this.endChar = pos;
156 };
157
Nils Diewaldbb33da22015-03-04 16:24:25 +0000158
Nils Diewald138e5b92014-01-10 21:15:13 +0000159 public int getStartPos () {
Nils Diewald392bcf32015-02-26 20:01:17 +0000160 return this.startPos;
Nils Diewald138e5b92014-01-10 21:15:13 +0000161 };
162
Nils Diewaldbb33da22015-03-04 16:24:25 +0000163
Nils Diewald138e5b92014-01-10 21:15:13 +0000164 public int getEndPos () {
Nils Diewald392bcf32015-02-26 20:01:17 +0000165 return this.endPos;
Nils Diewald138e5b92014-01-10 21:15:13 +0000166 };
167
Nils Diewaldbb33da22015-03-04 16:24:25 +0000168
Nils Diewald138e5b92014-01-10 21:15:13 +0000169 public byte getDepth () {
Nils Diewald392bcf32015-02-26 20:01:17 +0000170 return this.depth;
Nils Diewald138e5b92014-01-10 21:15:13 +0000171 };
172
Nils Diewaldbb33da22015-03-04 16:24:25 +0000173
Nils Diewald138e5b92014-01-10 21:15:13 +0000174 public String getFoundry () {
Nils Diewald392bcf32015-02-26 20:01:17 +0000175 return this.foundry;
Nils Diewald138e5b92014-01-10 21:15:13 +0000176 };
177
Nils Diewaldbb33da22015-03-04 16:24:25 +0000178
Nils Diewald138e5b92014-01-10 21:15:13 +0000179 public String getLayer () {
Nils Diewald392bcf32015-02-26 20:01:17 +0000180 return this.layer;
Nils Diewald138e5b92014-01-10 21:15:13 +0000181 };
182
Nils Diewaldbb33da22015-03-04 16:24:25 +0000183
Nils Diewald138e5b92014-01-10 21:15:13 +0000184 public String getValue () {
Nils Diewald392bcf32015-02-26 20:01:17 +0000185 return this.value;
Nils Diewald138e5b92014-01-10 21:15:13 +0000186 };
187
Nils Diewaldbb33da22015-03-04 16:24:25 +0000188
Nils Diewaldcde69082014-01-16 15:46:48 +0000189 public String getAnnotation () {
Nils Diewald392bcf32015-02-26 20:01:17 +0000190 return this.annotation;
Nils Diewaldcde69082014-01-16 15:46:48 +0000191 };
192
Nils Diewaldbb33da22015-03-04 16:24:25 +0000193
Nils Diewald345bdc02014-01-21 21:48:57 +0000194 public String toString () {
Nils Diewald392bcf32015-02-26 20:01:17 +0000195 this.analyze();
Nils Diewald345bdc02014-01-21 21:48:57 +0000196
Nils Diewald392bcf32015-02-26 20:01:17 +0000197 StringBuffer sb = new StringBuffer();
198 sb.append('<').append(this.getType()).append('>');
199 sb.append(this.getFoundry()).append('/').append(this.getLayer());
Nils Diewald345bdc02014-01-21 21:48:57 +0000200
Nils Diewald392bcf32015-02-26 20:01:17 +0000201 if (this.getValue() != null)
202 sb.append(':').append(this.getValue());
Nils Diewald345bdc02014-01-21 21:48:57 +0000203
Nils Diewald392bcf32015-02-26 20:01:17 +0000204 if (this.getDepth() != (byte) 0)
205 sb.append('(').append(this.getDepth()).append(')');
Nils Diewald345bdc02014-01-21 21:48:57 +0000206
Nils Diewald392bcf32015-02-26 20:01:17 +0000207 sb.append('[').append(this.getStartPos());
208 sb.append('-').append(this.getEndPos()).append(']');
209 sb.append('[').append(this.getStartChar());
210 sb.append('-').append(this.getEndChar()).append(']');
Nils Diewald345bdc02014-01-21 21:48:57 +0000211
Nils Diewald392bcf32015-02-26 20:01:17 +0000212 return sb.toString();
Nils Diewald345bdc02014-01-21 21:48:57 +0000213 };
214
Nils Diewaldbb33da22015-03-04 16:24:25 +0000215
Nils Diewald138e5b92014-01-10 21:15:13 +0000216 @Override
217 public int compareTo (TermInfo obj) {
Nils Diewald392bcf32015-02-26 20:01:17 +0000218 this.analyze();
219 obj.analyze();
Nils Diewald138e5b92014-01-10 21:15:13 +0000220
Nils Diewald392bcf32015-02-26 20:01:17 +0000221 // TODO: This sorting does not seem to work!
222 // although it might only be important for depth stuff.
Nils Diewaldcde69082014-01-16 15:46:48 +0000223
Nils Diewald392bcf32015-02-26 20:01:17 +0000224 if (this.startChar < obj.startChar) {
225 return -1;
226 }
227 else if (this.startChar > obj.startChar) {
228 return 1;
229 }
230 else if (this.depth < obj.depth) {
231 return 1;
232 }
233 else if (this.depth > obj.depth) {
234 return -1;
235 };
236 return 0;
Nils Diewald2cd1c3d2014-01-08 22:53:08 +0000237 };
238};