| Nils Diewald | 2cd1c3d | 2014-01-08 22:53:08 +0000 | [diff] [blame] | 1 | package de.ids_mannheim.korap.index; |
| 2 | |
| 3 | import java.util.*; |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 4 | import java.nio.ByteBuffer; |
| Nils Diewald | cde6908 | 2014-01-16 15:46:48 +0000 | [diff] [blame] | 5 | import java.lang.StringBuffer; |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 6 | import java.util.regex.*; |
| Nils Diewald | 392bcf3 | 2015-02-26 20:01:17 +0000 | [diff] [blame] | 7 | import de.ids_mannheim.korap.response.Match; |
| Nils Diewald | 2cd1c3d | 2014-01-08 22:53:08 +0000 | [diff] [blame] | 8 | |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 9 | import org.slf4j.Logger; |
| 10 | import org.slf4j.LoggerFactory; |
| Nils Diewald | 2cd1c3d | 2014-01-08 22:53:08 +0000 | [diff] [blame] | 11 | |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 12 | public class TermInfo implements Comparable<TermInfo> { |
| 13 | |
| 14 | // Logger |
| Nils Diewald | 392bcf3 | 2015-02-26 20:01:17 +0000 | [diff] [blame] | 15 | private final static Logger log = LoggerFactory.getLogger(Match.class); |
| Nils Diewald | 82a4b86 | 2014-02-20 21:17:41 +0000 | [diff] [blame] | 16 | // This advices the java compiler to ignore all loggings |
| 17 | public static final boolean DEBUG = false; |
| 18 | |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 19 | |
| Nils Diewald | cde6908 | 2014-01-16 15:46:48 +0000 | [diff] [blame] | 20 | private String foundry, layer, value, term, type, annotation; |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 21 | // type can be "term", "pos", "span", "rel-src", "rel-target" |
| 22 | |
| Nils Diewald | 2cd1c3d | 2014-01-08 22:53:08 +0000 | [diff] [blame] | 23 | private int pos = 0; |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 24 | private ByteBuffer payload; |
| 25 | private boolean analyzed = false; |
| Nils Diewald | 2cd1c3d | 2014-01-08 22:53:08 +0000 | [diff] [blame] | 26 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 27 | private int startChar = -1, endChar = -1, startPos = -1, endPos = -1; |
| Nils Diewald | 2cd1c3d | 2014-01-08 22:53:08 +0000 | [diff] [blame] | 28 | |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 29 | private byte depth = (byte) 0; |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 30 | |
| 31 | private Pattern prefixRegex = Pattern |
| 32 | .compile("(?:([^/]+)/)?([^:/]+)(?::(.+?))?"); |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 33 | private Matcher matcher; |
| 34 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 35 | |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 36 | public TermInfo (String term, int pos, ByteBuffer payload) { |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 37 | this.term = term; |
| Nils Diewald | 392bcf3 | 2015-02-26 20:01:17 +0000 | [diff] [blame] | 38 | this.startPos = pos; |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 39 | this.endPos = pos; |
| 40 | this.payload = payload; |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 41 | }; |
| 42 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 43 | |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 44 | public TermInfo analyze () { |
| Nils Diewald | 392bcf3 | 2015-02-26 20:01:17 +0000 | [diff] [blame] | 45 | if (analyzed) |
| 46 | return this; |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 47 | |
| Nils Diewald | 392bcf3 | 2015-02-26 20:01:17 +0000 | [diff] [blame] | 48 | int ttype = 0; |
| 49 | String tterm = this.term; |
| 50 | int lastPos = this.payload.position(); |
| 51 | this.payload.rewind(); |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 52 | |
| Nils Diewald | 392bcf3 | 2015-02-26 20:01:17 +0000 | [diff] [blame] | 53 | switch (tterm.charAt(0)) { |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 54 | case '<': |
| 55 | // "<>:mate/l:..." |
| 56 | if (tterm.charAt(1) == '>') { |
| 57 | // span |
| 58 | this.type = "span"; |
| 59 | tterm = tterm.substring(3); |
| 60 | ttype = 2; |
| 61 | } |
| 62 | // rel-target |
| 63 | else { |
| 64 | this.type = "relTarget"; |
| 65 | tterm = tterm.substring(2); |
| 66 | ttype = 3; |
| 67 | } |
| 68 | ; |
| 69 | break; |
| 70 | |
| 71 | case '>': |
| 72 | // rel-src |
| 73 | this.type = "relSrc"; |
| Nils Diewald | 392bcf3 | 2015-02-26 20:01:17 +0000 | [diff] [blame] | 74 | tterm = tterm.substring(2); |
| 75 | ttype = 3; |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 76 | break; |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 77 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 78 | case '_': |
| 79 | // pos |
| 80 | this.type = "pos"; |
| 81 | ttype = 1; |
| 82 | tterm = tterm.substring(1); |
| 83 | break; |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 84 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 85 | default: |
| 86 | // term |
| 87 | this.type = "term"; |
| Nils Diewald | 392bcf3 | 2015-02-26 20:01:17 +0000 | [diff] [blame] | 88 | }; |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 89 | |
| Nils Diewald | 392bcf3 | 2015-02-26 20:01:17 +0000 | [diff] [blame] | 90 | // Analyze term value |
| 91 | if (ttype != 1) { |
| 92 | if (DEBUG) |
| 93 | log.trace("Check {} for {}", tterm, prefixRegex.toString()); |
| 94 | matcher = prefixRegex.matcher(tterm); |
| 95 | if (matcher.matches() && matcher.groupCount() == 3) { |
| 96 | this.annotation = tterm; |
| 97 | if (matcher.group(1) != null) |
| 98 | this.foundry = matcher.group(1); |
| 99 | else |
| 100 | this.foundry = "base"; |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 101 | this.layer = matcher.group(2); |
| 102 | this.value = matcher.group(3); |
| Nils Diewald | 392bcf3 | 2015-02-26 20:01:17 +0000 | [diff] [blame] | 103 | }; |
| 104 | } |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 105 | |
| Nils Diewald | 392bcf3 | 2015-02-26 20:01:17 +0000 | [diff] [blame] | 106 | // for positions |
| 107 | else { |
| 108 | this.value = tterm; |
| 109 | this.startChar = this.payload.getInt(); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 110 | this.endChar = this.payload.getInt(); |
| Nils Diewald | 392bcf3 | 2015-02-26 20:01:17 +0000 | [diff] [blame] | 111 | }; |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 112 | |
| Nils Diewald | 392bcf3 | 2015-02-26 20:01:17 +0000 | [diff] [blame] | 113 | // for spans |
| 114 | if (ttype == 2) { |
| 115 | this.startChar = this.payload.getInt(); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 116 | this.endChar = this.payload.getInt(); |
| Nils Diewald | 392bcf3 | 2015-02-26 20:01:17 +0000 | [diff] [blame] | 117 | }; |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 118 | |
| Nils Diewald | 392bcf3 | 2015-02-26 20:01:17 +0000 | [diff] [blame] | 119 | // for spans and relations |
| 120 | if (ttype > 1) |
| 121 | // Unsure if this is correct |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 122 | this.endPos = this.payload.getInt() - 1; |
| 123 | |
| Nils Diewald | 392bcf3 | 2015-02-26 20:01:17 +0000 | [diff] [blame] | 124 | if (ttype == 2 && this.payload.position() < lastPos) { |
| 125 | this.depth = this.payload.get(); |
| 126 | }; |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 127 | |
| Nils Diewald | 392bcf3 | 2015-02-26 20:01:17 +0000 | [diff] [blame] | 128 | // payloads can have different meaning |
| 129 | analyzed = true; |
| 130 | return this; |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 131 | }; |
| 132 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 133 | |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 134 | public String getType () { |
| Nils Diewald | 392bcf3 | 2015-02-26 20:01:17 +0000 | [diff] [blame] | 135 | return this.type; |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 136 | }; |
| 137 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 138 | |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 139 | public int getStartChar () { |
| Nils Diewald | 392bcf3 | 2015-02-26 20:01:17 +0000 | [diff] [blame] | 140 | return this.startChar; |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 141 | }; |
| 142 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 143 | |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 144 | public void setStartChar (int pos) { |
| 145 | this.startChar = pos; |
| 146 | }; |
| 147 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 148 | |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 149 | public int getEndChar () { |
| Nils Diewald | 392bcf3 | 2015-02-26 20:01:17 +0000 | [diff] [blame] | 150 | return this.endChar; |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 151 | }; |
| 152 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 153 | |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 154 | public void setEndChar (int pos) { |
| 155 | this.endChar = pos; |
| 156 | }; |
| 157 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 158 | |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 159 | public int getStartPos () { |
| Nils Diewald | 392bcf3 | 2015-02-26 20:01:17 +0000 | [diff] [blame] | 160 | return this.startPos; |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 161 | }; |
| 162 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 163 | |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 164 | public int getEndPos () { |
| Nils Diewald | 392bcf3 | 2015-02-26 20:01:17 +0000 | [diff] [blame] | 165 | return this.endPos; |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 166 | }; |
| 167 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 168 | |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 169 | public byte getDepth () { |
| Nils Diewald | 392bcf3 | 2015-02-26 20:01:17 +0000 | [diff] [blame] | 170 | return this.depth; |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 171 | }; |
| 172 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 173 | |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 174 | public String getFoundry () { |
| Nils Diewald | 392bcf3 | 2015-02-26 20:01:17 +0000 | [diff] [blame] | 175 | return this.foundry; |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 176 | }; |
| 177 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 178 | |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 179 | public String getLayer () { |
| Nils Diewald | 392bcf3 | 2015-02-26 20:01:17 +0000 | [diff] [blame] | 180 | return this.layer; |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 181 | }; |
| 182 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 183 | |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 184 | public String getValue () { |
| Nils Diewald | 392bcf3 | 2015-02-26 20:01:17 +0000 | [diff] [blame] | 185 | return this.value; |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 186 | }; |
| 187 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 188 | |
| Nils Diewald | cde6908 | 2014-01-16 15:46:48 +0000 | [diff] [blame] | 189 | public String getAnnotation () { |
| Nils Diewald | 392bcf3 | 2015-02-26 20:01:17 +0000 | [diff] [blame] | 190 | return this.annotation; |
| Nils Diewald | cde6908 | 2014-01-16 15:46:48 +0000 | [diff] [blame] | 191 | }; |
| 192 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 193 | |
| Nils Diewald | 345bdc0 | 2014-01-21 21:48:57 +0000 | [diff] [blame] | 194 | public String toString () { |
| Nils Diewald | 392bcf3 | 2015-02-26 20:01:17 +0000 | [diff] [blame] | 195 | this.analyze(); |
| Nils Diewald | 345bdc0 | 2014-01-21 21:48:57 +0000 | [diff] [blame] | 196 | |
| Nils Diewald | 392bcf3 | 2015-02-26 20:01:17 +0000 | [diff] [blame] | 197 | StringBuffer sb = new StringBuffer(); |
| 198 | sb.append('<').append(this.getType()).append('>'); |
| 199 | sb.append(this.getFoundry()).append('/').append(this.getLayer()); |
| Nils Diewald | 345bdc0 | 2014-01-21 21:48:57 +0000 | [diff] [blame] | 200 | |
| Nils Diewald | 392bcf3 | 2015-02-26 20:01:17 +0000 | [diff] [blame] | 201 | if (this.getValue() != null) |
| 202 | sb.append(':').append(this.getValue()); |
| Nils Diewald | 345bdc0 | 2014-01-21 21:48:57 +0000 | [diff] [blame] | 203 | |
| Nils Diewald | 392bcf3 | 2015-02-26 20:01:17 +0000 | [diff] [blame] | 204 | if (this.getDepth() != (byte) 0) |
| 205 | sb.append('(').append(this.getDepth()).append(')'); |
| Nils Diewald | 345bdc0 | 2014-01-21 21:48:57 +0000 | [diff] [blame] | 206 | |
| Nils Diewald | 392bcf3 | 2015-02-26 20:01:17 +0000 | [diff] [blame] | 207 | sb.append('[').append(this.getStartPos()); |
| 208 | sb.append('-').append(this.getEndPos()).append(']'); |
| 209 | sb.append('[').append(this.getStartChar()); |
| 210 | sb.append('-').append(this.getEndChar()).append(']'); |
| Nils Diewald | 345bdc0 | 2014-01-21 21:48:57 +0000 | [diff] [blame] | 211 | |
| Nils Diewald | 392bcf3 | 2015-02-26 20:01:17 +0000 | [diff] [blame] | 212 | return sb.toString(); |
| Nils Diewald | 345bdc0 | 2014-01-21 21:48:57 +0000 | [diff] [blame] | 213 | }; |
| 214 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 215 | |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 216 | @Override |
| 217 | public int compareTo (TermInfo obj) { |
| Nils Diewald | 392bcf3 | 2015-02-26 20:01:17 +0000 | [diff] [blame] | 218 | this.analyze(); |
| 219 | obj.analyze(); |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 220 | |
| Nils Diewald | 392bcf3 | 2015-02-26 20:01:17 +0000 | [diff] [blame] | 221 | // TODO: This sorting does not seem to work! |
| 222 | // although it might only be important for depth stuff. |
| Nils Diewald | cde6908 | 2014-01-16 15:46:48 +0000 | [diff] [blame] | 223 | |
| Nils Diewald | 392bcf3 | 2015-02-26 20:01:17 +0000 | [diff] [blame] | 224 | if (this.startChar < obj.startChar) { |
| 225 | return -1; |
| 226 | } |
| 227 | else if (this.startChar > obj.startChar) { |
| 228 | return 1; |
| 229 | } |
| 230 | else if (this.depth < obj.depth) { |
| 231 | return 1; |
| 232 | } |
| 233 | else if (this.depth > obj.depth) { |
| 234 | return -1; |
| 235 | }; |
| 236 | return 0; |
| Nils Diewald | 2cd1c3d | 2014-01-08 22:53:08 +0000 | [diff] [blame] | 237 | }; |
| 238 | }; |