| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 1 | package de.ids_mannheim.korap.analysis; |
| 2 | |
| 3 | import de.ids_mannheim.korap.analysis.MultiTerm; |
| 4 | import de.ids_mannheim.korap.analysis.MultiTermToken; |
| 5 | import static de.ids_mannheim.korap.util.KorapByte.*; |
| 6 | import org.apache.lucene.util.BytesRef; |
| 7 | |
| 8 | import java.util.*; |
| 9 | import java.util.regex.*; |
| 10 | |
| 11 | import org.apache.lucene.analysis.TokenStream; |
| 12 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 13 | import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; |
| 14 | import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| 15 | |
| 16 | import org.slf4j.Logger; |
| 17 | import org.slf4j.LoggerFactory; |
| 18 | |
| Nils Diewald | b5b7b8d | 2014-06-06 18:41:54 +0000 | [diff] [blame] | 19 | import java.io.Reader; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 20 | import java.io.IOException; |
| 21 | |
| 22 | /* |
| 23 | Todo: |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 24 | - !Payload is [4ByteStartOffset][14BitEndOffset-startOffset][1BitBooleanIfSpan][1BitBooleanIfOpen] |
| 25 | - Payload is [4ByteOffsetStart][4ByteOffsetEnd] |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 26 | */ |
| 27 | |
| 28 | /** |
| 29 | * @author Nils Diewald |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 30 | * @version 0.3 |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 31 | * |
| 32 | * MultiTermTokenStream extends Lucene's TokenStream class to work with MultiTermTokens. |
| 33 | * |
| 34 | * @see org.apache.lucene.analysis.TokenStream |
| 35 | */ |
| 36 | public class MultiTermTokenStream extends TokenStream { |
| 37 | private CharTermAttribute charTermAttr; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 38 | private PositionIncrementAttribute posIncrAttr; |
| 39 | private PayloadAttribute payloadAttr; |
| 40 | |
| Nils Diewald | 67f5404 | 2014-09-27 14:53:38 +0000 | [diff] [blame] | 41 | |
| 42 | /* |
| 43 | TODO: Update to new Tokeanstream API |
| 44 | http://www.hankcs.com/program/java/lucene-4-6-1-java-lang-illegalstateexception-tokenstream-contract-violation.html |
| 45 | */ |
| 46 | |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 47 | private static final Pattern pattern = Pattern.compile("\\[(?:\\([0-9]+-[0-9]+\\))?([^\\]]+?)\\]"); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 48 | |
| Nils Diewald | 82a4b86 | 2014-02-20 21:17:41 +0000 | [diff] [blame] | 49 | // This advices the java compiler to ignore all loggings |
| 50 | public static final boolean DEBUG = false; |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 51 | private final Logger log = LoggerFactory.getLogger(MultiTermTokenStream.class); |
| Nils Diewald | 82a4b86 | 2014-02-20 21:17:41 +0000 | [diff] [blame] | 52 | |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 53 | private List<MultiTermToken> multiTermTokens; |
| 54 | private int mttIndex = 0, mtIndex = 0; |
| 55 | private static short i = 0; |
| 56 | |
| 57 | |
| 58 | /** |
| 59 | * The empty Constructor. |
| 60 | */ |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 61 | public MultiTermTokenStream () { |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 62 | this.charTermAttr = this.addAttribute(CharTermAttribute.class); |
| 63 | this.posIncrAttr = this.addAttribute(PositionIncrementAttribute.class); |
| 64 | this.payloadAttr = this.addAttribute(PayloadAttribute.class); |
| 65 | this.multiTermTokens = new ArrayList<MultiTermToken>(100); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 66 | }; |
| 67 | |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 68 | |
| 69 | /** |
| 70 | * The Constructor. |
| 71 | * |
| 72 | * @param stream The MultiTermTokenStream as a string representation. |
| 73 | */ |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 74 | public MultiTermTokenStream (String stream) { |
| 75 | this(); |
| Nils Diewald | b5b7b8d | 2014-06-06 18:41:54 +0000 | [diff] [blame] | 76 | this._fromString(stream); |
| 77 | }; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 78 | |
| Nils Diewald | b5b7b8d | 2014-06-06 18:41:54 +0000 | [diff] [blame] | 79 | /** |
| 80 | * The Constructor. |
| 81 | * |
| 82 | * @param stream The MultiTermTokenStream as a reader object. |
| 83 | */ |
| 84 | public MultiTermTokenStream (Reader stream) throws IOException { |
| 85 | this(); |
| 86 | |
| 87 | StringBuilder sb = new StringBuilder(4096); |
| 88 | char[] buf = new char[128]; |
| 89 | int i; |
| 90 | while ((i = stream.read(buf)) > 0) { |
| 91 | sb.append(buf, 0, i); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 92 | }; |
| Nils Diewald | b5b7b8d | 2014-06-06 18:41:54 +0000 | [diff] [blame] | 93 | this._fromString(sb.toString()); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 94 | }; |
| 95 | |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 96 | |
| 97 | /** |
| 98 | * Add a MultiTermToken to the end of the MultiTermTokenStream. |
| 99 | * |
| 100 | * @param mtt A MultiTermToken. |
| 101 | */ |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 102 | public void addMultiTermToken (MultiTermToken mtt) { |
| 103 | this.multiTermTokens.add(mtt); |
| 104 | }; |
| 105 | |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 106 | |
| 107 | /** |
| 108 | * Add a MultiTermToken by means of MultiTerms to the end of |
| 109 | * the MultiTermTokenStream. |
| 110 | * |
| 111 | * @param term At least one MultiTerm. |
| 112 | */ |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 113 | public void addMultiTermToken (MultiTerm term, MultiTerm ... moreTerms) { |
| 114 | this.addMultiTermToken(new MultiTermToken(term, moreTerms)); |
| 115 | }; |
| 116 | |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 117 | |
| 118 | /** |
| 119 | * Add a MultiTermToken by means of a single MultiTerm to the end of |
| 120 | * the MultiTermTokenStream. |
| 121 | * |
| 122 | * @param prefix A prefix character of a surface form of a MultiTerm. |
| 123 | * @param surface A surface string of a MultiTerm. |
| 124 | */ |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 125 | public void addMultiTermToken (char prefix, String surface) { |
| 126 | this.addMultiTermToken(new MultiTermToken(prefix, surface)); |
| 127 | }; |
| 128 | |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 129 | |
| 130 | /** |
| 131 | * Add a MultiTermToken by means of a a series of surface strings |
| 132 | * to the end of the MultiTermTokenStream. |
| 133 | * |
| 134 | * @param surface At least one surface string of a MultiTerm. |
| 135 | */ |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 136 | public void addMultiTermToken (String surface, String ... moreTerms) { |
| 137 | this.addMultiTermToken(new MultiTermToken(surface, moreTerms)); |
| 138 | }; |
| 139 | |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 140 | |
| 141 | /** |
| 142 | * Add meta information to the MultiTermTokenStream. |
| 143 | * |
| 144 | * @param key A string for denoting the meta information. |
| 145 | * @param value The value of the meta key as a string. |
| 146 | */ |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 147 | public void addMeta (String key, String value) { |
| 148 | MultiTerm mt = new MultiTerm('-', key); |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 149 | mt.setPayload(value); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 150 | this.multiTermTokens.get(0).add(mt); |
| 151 | }; |
| 152 | |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 153 | |
| 154 | /** |
| 155 | * Add meta information to the MultiTermTokenStream. |
| 156 | * |
| 157 | * @param key A string for denoting the meta information. |
| 158 | * @param value The value of the meta key as a byte array. |
| 159 | */ |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 160 | public void addMeta (String key, byte[] value) { |
| 161 | MultiTerm mt = new MultiTerm('-', key); |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 162 | mt.setPayload(value); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 163 | this.multiTermTokens.get(0).add(mt); |
| 164 | }; |
| 165 | |
| 166 | |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 167 | /** |
| 168 | * Add meta information to the MultiTermTokenStream. |
| 169 | * |
| 170 | * @param key A string for denoting the meta information. |
| 171 | * @param value The value of the meta key as a short value. |
| 172 | */ |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 173 | public void addMeta (String key, short value) { |
| 174 | MultiTerm mt = new MultiTerm('-', key); |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 175 | mt.setPayload(value); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 176 | this.multiTermTokens.get(0).add(mt); |
| 177 | }; |
| 178 | |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 179 | |
| 180 | /** |
| 181 | * Add meta information to the MultiTermTokenStream. |
| 182 | * |
| 183 | * @param key A string for denoting the meta information. |
| 184 | * @param value The value of the meta key as a long value. |
| 185 | */ |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 186 | public void addMeta (String key, long value) { |
| 187 | MultiTerm mt = new MultiTerm('-', key); |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 188 | mt.setPayload(value); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 189 | this.multiTermTokens.get(0).add(mt); |
| 190 | }; |
| 191 | |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 192 | |
| 193 | /** |
| 194 | * Add meta information to the MultiTermTokenStream. |
| 195 | * |
| 196 | * @param key A string for denoting the meta information. |
| 197 | * @param value The value of the meta key as a integer value. |
| 198 | */ |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 199 | public void addMeta (String key, int value) { |
| 200 | MultiTerm mt = new MultiTerm('-', key); |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 201 | mt.setPayload(value); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 202 | this.multiTermTokens.get(0).add(mt); |
| 203 | }; |
| 204 | |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 205 | |
| 206 | /** |
| 207 | * Increment the token in the MultiTermTokenStream. |
| 208 | * This overrides the function in Lucene's TokenStream. |
| 209 | */ |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 210 | @Override |
| 211 | public final boolean incrementToken() throws IOException { |
| 212 | this.payloadAttr.setPayload(null); |
| 213 | |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 214 | // Last token reached |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 215 | if (this.multiTermTokens.size() == this.mttIndex) { |
| 216 | reset(); |
| 217 | return false; |
| 218 | }; |
| 219 | |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 220 | // Get current token |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 221 | MultiTermToken mtt = this.multiTermTokens.get( this.mttIndex ); |
| 222 | |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 223 | // Last term reached |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 224 | if (mtt.terms.size() == this.mtIndex) { |
| 225 | this.mtIndex = 0; |
| 226 | this.mttIndex++; |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 227 | |
| 228 | // Last term of last token reached |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 229 | if (this.multiTermTokens.size() == this.mttIndex) { |
| 230 | reset(); |
| 231 | return false; |
| 232 | } |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 233 | |
| 234 | // Get last token |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 235 | else { |
| 236 | mtt = this.multiTermTokens.get( this.mttIndex ); |
| 237 | }; |
| 238 | }; |
| 239 | |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 240 | // Get current term |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 241 | MultiTerm mt = mtt.terms.get(this.mtIndex); |
| 242 | |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 243 | // Set the relative position to the former term |
| 244 | posIncrAttr.setPositionIncrement( mt.posIncr ); |
| 245 | charTermAttr.setEmpty(); |
| 246 | charTermAttr.append( mt.term ); |
| 247 | |
| 248 | BytesRef payload = new BytesRef(); |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 249 | |
| 250 | // There is offset information |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 251 | if (mt.start != mt.end) { |
| Nils Diewald | 82a4b86 | 2014-02-20 21:17:41 +0000 | [diff] [blame] | 252 | if (DEBUG) |
| 253 | log.trace("MultiTerm with payload offset: {}-{}", mt.start, mt.end); |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 254 | |
| 255 | // Add offsets to BytesRef payload |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 256 | payload.append(new BytesRef(int2byte(mt.start))); |
| 257 | payload.append(new BytesRef(int2byte(mt.end))); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 258 | }; |
| 259 | |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 260 | // There is payload in the MultiTerm |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 261 | if (mt.payload != null) { |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 262 | payload.append(mt.payload); |
| Nils Diewald | 82a4b86 | 2014-02-20 21:17:41 +0000 | [diff] [blame] | 263 | if (DEBUG) |
| 264 | log.trace("Create payload[1] {}", payload.toString()); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 265 | }; |
| 266 | |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 267 | // There is payload in the current token to index |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 268 | if (payload.length > 0) { |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 269 | payloadAttr.setPayload(payload); |
| Nils Diewald | 82a4b86 | 2014-02-20 21:17:41 +0000 | [diff] [blame] | 270 | if (DEBUG) |
| 271 | log.trace("Set payload[2] {}", payload.toString()); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 272 | }; |
| 273 | |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 274 | if (DEBUG) { |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 275 | StringBuilder sb = new StringBuilder("Index: ["); |
| 276 | sb.append(mt.term); |
| 277 | if (payload.length > 0) |
| 278 | sb.append('$').append(payload.toString()); |
| 279 | sb.append(']'); |
| 280 | sb.append(" with increment ").append(mt.posIncr); |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 281 | |
| 282 | log.trace(sb.toString()); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 283 | }; |
| 284 | |
| 285 | this.mtIndex++; |
| 286 | |
| 287 | return true; |
| 288 | }; |
| 289 | |
| 290 | public String toString () { |
| 291 | StringBuffer sb = new StringBuffer(); |
| 292 | for (MultiTermToken mtt : this.multiTermTokens) { |
| 293 | sb.append( mtt.toString() ); |
| 294 | }; |
| 295 | return sb.toString(); |
| 296 | }; |
| 297 | |
| Nils Diewald | b5b7b8d | 2014-06-06 18:41:54 +0000 | [diff] [blame] | 298 | private void _fromString (String stream) { |
| 299 | Matcher matcher = pattern.matcher(stream); |
| 300 | |
| 301 | while (matcher.find()) { |
| 302 | String[] seg = matcher.group(1).split("\\|"); |
| 303 | MultiTermToken mtt = new MultiTermToken( seg[0] ); |
| 304 | |
| 305 | for (i = 1; i < seg.length; i++) |
| 306 | mtt.add(seg[i]); |
| 307 | |
| 308 | this.addMultiTermToken(mtt); |
| 309 | }; |
| 310 | }; |
| 311 | |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 312 | @Override |
| 313 | public void reset() { |
| 314 | this.mttIndex = 0; |
| 315 | this.mtIndex = 0; |
| 316 | }; |
| 317 | }; |