package de.ids_mannheim.korap.index;

import java.util.*;
import java.nio.ByteBuffer;
import java.lang.StringBuffer;
import java.util.regex.*;
import de.ids_mannheim.korap.KorapMatch;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 12 | public class TermInfo implements Comparable<TermInfo> { |
| 13 | |
| 14 | // Logger |
| 15 | private final static Logger log = LoggerFactory.getLogger(KorapMatch.class); |
| Nils Diewald | 82a4b86 | 2014-02-20 21:17:41 +0000 | [diff] [blame] | 16 | // This advices the java compiler to ignore all loggings |
| 17 | public static final boolean DEBUG = false; |
| 18 | |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 19 | |
| Nils Diewald | cde6908 | 2014-01-16 15:46:48 +0000 | [diff] [blame] | 20 | private String foundry, layer, value, term, type, annotation; |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 21 | // type can be "term", "pos", "span", "rel-src", "rel-target" |
| 22 | |
| Nils Diewald | 2cd1c3d | 2014-01-08 22:53:08 +0000 | [diff] [blame] | 23 | private int pos = 0; |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 24 | private ByteBuffer payload; |
| 25 | private boolean analyzed = false; |
| Nils Diewald | 2cd1c3d | 2014-01-08 22:53:08 +0000 | [diff] [blame] | 26 | |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 27 | private int startChar = -1, |
| 28 | endChar = -1, |
| 29 | startPos = -1, |
| 30 | endPos = -1; |
| Nils Diewald | 2cd1c3d | 2014-01-08 22:53:08 +0000 | [diff] [blame] | 31 | |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 32 | private byte depth = (byte) 0; |
| 33 | |
| Nils Diewald | 345bdc0 | 2014-01-21 21:48:57 +0000 | [diff] [blame] | 34 | private Pattern prefixRegex = Pattern.compile("(?:([^/]+)/)?([^:/]+)(?::(.+?))?"); |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 35 | private Matcher matcher; |
| 36 | |
| 37 | public TermInfo (String term, int pos, ByteBuffer payload) { |
| 38 | this.term = term; |
| 39 | this.startPos = pos; |
| Nils Diewald | cde6908 | 2014-01-16 15:46:48 +0000 | [diff] [blame] | 40 | this.endPos = pos; |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 41 | this.payload = payload; |
| 42 | }; |
| 43 | |
| 44 | public TermInfo analyze () { |
| 45 | if (analyzed) |
| 46 | return this; |
| 47 | |
| 48 | int ttype = 0; |
| 49 | String tterm = this.term; |
| Nils Diewald | 345bdc0 | 2014-01-21 21:48:57 +0000 | [diff] [blame] | 50 | int lastPos = this.payload.position(); |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 51 | this.payload.rewind(); |
| 52 | |
| 53 | switch (tterm.charAt(0)) { |
| 54 | case '<': |
| 55 | // "<>:mate/l:..." |
| 56 | if (tterm.charAt(1) == '>') { |
| 57 | // span |
| 58 | this.type = "span"; |
| 59 | tterm = tterm.substring(3); |
| 60 | ttype = 2; |
| 61 | } |
| 62 | // rel-target |
| 63 | else { |
| 64 | this.type = "relTarget"; |
| 65 | tterm = tterm.substring(2); |
| 66 | ttype = 3; |
| 67 | }; |
| 68 | break; |
| 69 | case '>': |
| 70 | // rel-src |
| 71 | this.type = "relSrc"; |
| 72 | tterm = tterm.substring(2); |
| 73 | ttype = 3; |
| 74 | break; |
| 75 | |
| 76 | case '_': |
| 77 | // pos |
| 78 | this.type = "pos"; |
| 79 | ttype = 1; |
| 80 | tterm = tterm.substring(1); |
| 81 | break; |
| 82 | default: |
| 83 | // term |
| 84 | this.type = "term"; |
| 85 | }; |
| 86 | |
| 87 | // Analyze term value |
| 88 | if (ttype != 1) { |
| Nils Diewald | 82a4b86 | 2014-02-20 21:17:41 +0000 | [diff] [blame] | 89 | if (DEBUG) |
| 90 | log.trace("Check {} for {}", tterm, prefixRegex.toString()); |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 91 | matcher = prefixRegex.matcher(tterm); |
| 92 | if (matcher.matches() && matcher.groupCount() == 3) { |
| Nils Diewald | cde6908 | 2014-01-16 15:46:48 +0000 | [diff] [blame] | 93 | this.annotation = tterm; |
| Nils Diewald | 345bdc0 | 2014-01-21 21:48:57 +0000 | [diff] [blame] | 94 | if (matcher.group(1) != null) |
| 95 | this.foundry = matcher.group(1); |
| 96 | else |
| 97 | this.foundry = "base"; |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 98 | this.layer = matcher.group(2); |
| 99 | this.value = matcher.group(3); |
| 100 | }; |
| 101 | } |
| 102 | |
| 103 | // for positions |
| 104 | else { |
| 105 | this.value = tterm; |
| 106 | this.startChar = this.payload.getInt(); |
| 107 | this.endChar = this.payload.getInt(); |
| 108 | }; |
| 109 | |
| 110 | // for spans |
| 111 | if (ttype == 2) { |
| 112 | this.startChar = this.payload.getInt(); |
| 113 | this.endChar = this.payload.getInt(); |
| 114 | }; |
| 115 | |
| 116 | // for spans and relations |
| 117 | if (ttype > 1) |
| Nils Diewald | cde6908 | 2014-01-16 15:46:48 +0000 | [diff] [blame] | 118 | // Unsure if this is correct |
| 119 | this.endPos = this.payload.getInt() -1; |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 120 | |
| Nils Diewald | 345bdc0 | 2014-01-21 21:48:57 +0000 | [diff] [blame] | 121 | if (ttype == 2 && this.payload.position() < lastPos) { |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 122 | this.depth = this.payload.get(); |
| 123 | }; |
| 124 | |
| 125 | // payloads can have different meaning |
| 126 | analyzed = true; |
| 127 | return this; |
| 128 | }; |
| 129 | |
| 130 | public String getType () { |
| 131 | return this.type; |
| 132 | }; |
| 133 | |
| 134 | public int getStartChar () { |
| 135 | return this.startChar; |
| 136 | }; |
| 137 | |
| 138 | public void setStartChar (int pos) { |
| 139 | this.startChar = pos; |
| 140 | }; |
| 141 | |
| 142 | public int getEndChar () { |
| 143 | return this.endChar; |
| 144 | }; |
| 145 | |
| 146 | public void setEndChar (int pos) { |
| 147 | this.endChar = pos; |
| 148 | }; |
| 149 | |
| 150 | public int getStartPos () { |
| 151 | return this.startPos; |
| 152 | }; |
| 153 | |
| 154 | public int getEndPos () { |
| 155 | return this.endPos; |
| 156 | }; |
| 157 | |
| 158 | public byte getDepth () { |
| 159 | return this.depth; |
| 160 | }; |
| 161 | |
| 162 | public String getFoundry () { |
| 163 | return this.foundry; |
| 164 | }; |
| 165 | |
| 166 | public String getLayer () { |
| 167 | return this.layer; |
| 168 | }; |
| 169 | |
| 170 | public String getValue () { |
| 171 | return this.value; |
| 172 | }; |
| 173 | |
| Nils Diewald | cde6908 | 2014-01-16 15:46:48 +0000 | [diff] [blame] | 174 | public String getAnnotation () { |
| 175 | return this.annotation; |
| 176 | }; |
| 177 | |
| Nils Diewald | 345bdc0 | 2014-01-21 21:48:57 +0000 | [diff] [blame] | 178 | public String toString () { |
| 179 | this.analyze(); |
| 180 | |
| 181 | StringBuffer sb = new StringBuffer(); |
| 182 | sb.append('<').append(this.getType()).append('>'); |
| 183 | sb.append(this.getFoundry()).append('/').append(this.getLayer()); |
| 184 | |
| 185 | if (this.getValue() != null) |
| 186 | sb.append(':').append(this.getValue()); |
| 187 | |
| 188 | if (this.getDepth() != (byte) 0) |
| 189 | sb.append('(').append(this.getDepth()).append(')'); |
| 190 | |
| 191 | sb.append('[').append(this.getStartPos()); |
| 192 | sb.append('-').append(this.getEndPos()).append(']'); |
| 193 | sb.append('[').append(this.getStartChar()); |
| 194 | sb.append('-').append(this.getEndChar()).append(']'); |
| 195 | |
| 196 | return sb.toString(); |
| 197 | }; |
| 198 | |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 199 | @Override |
| 200 | public int compareTo (TermInfo obj) { |
| 201 | this.analyze(); |
| 202 | obj.analyze(); |
| 203 | |
| Nils Diewald | cde6908 | 2014-01-16 15:46:48 +0000 | [diff] [blame] | 204 | // TODO: This sorting does not seem to work! |
| 205 | // although it might only be important for depth stuff. |
| 206 | |
| Nils Diewald | 138e5b9 | 2014-01-10 21:15:13 +0000 | [diff] [blame] | 207 | if (this.startChar < obj.startChar) { |
| 208 | return -1; |
| 209 | } |
| 210 | else if (this.startChar > obj.startChar) { |
| 211 | return 1; |
| 212 | } |
| 213 | else if (this.depth < obj.depth) { |
| 214 | return 1; |
| 215 | } |
| 216 | else if (this.depth > obj.depth) { |
| 217 | return -1; |
| 218 | }; |
| 219 | return 0; |
| Nils Diewald | 2cd1c3d | 2014-01-08 22:53:08 +0000 | [diff] [blame] | 220 | }; |
| 221 | }; |