blob: c869c0a13821ecfa976af5bcf8024bfc3abb9d4d [file] [log] [blame]
Nils Diewaldf399a672013-11-18 17:55:22 +00001package de.ids_mannheim.korap.analysis;
2
3import de.ids_mannheim.korap.analysis.MultiTerm;
4import de.ids_mannheim.korap.analysis.MultiTermToken;
5import static de.ids_mannheim.korap.util.KorapByte.*;
6import org.apache.lucene.util.BytesRef;
7
8import java.util.*;
9import java.util.regex.*;
10
11import org.apache.lucene.analysis.TokenStream;
12import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
Nils Diewaldf399a672013-11-18 17:55:22 +000013import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
14import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
15
16import org.slf4j.Logger;
17import org.slf4j.LoggerFactory;
18
Nils Diewaldb5b7b8d2014-06-06 18:41:54 +000019import java.io.Reader;
Nils Diewaldf399a672013-11-18 17:55:22 +000020import java.io.IOException;
21
/*
 * TODO:
 *  - !Payload is [4ByteStartOffset][14BitEndOffset-startOffset][1BitBooleanIfSpan][1BitBooleanIfOpen]
 *  - Payload is [4ByteOffsetStart][4ByteOffsetEnd]
 */
27
28/**
29 * @author Nils Diewald
Nils Diewaldd0d6feb2014-02-26 18:51:08 +000030 * @version 0.3
Nils Diewaldf399a672013-11-18 17:55:22 +000031 *
32 * MultiTermTokenStream extends Lucenes TokenStream class to work with MultiTermTokens.
33 *
34 * @see org.apache.lucene.analysis.TokenStream
35 */
36public class MultiTermTokenStream extends TokenStream {
37 private CharTermAttribute charTermAttr;
Nils Diewaldf399a672013-11-18 17:55:22 +000038 private PositionIncrementAttribute posIncrAttr;
39 private PayloadAttribute payloadAttr;
40
Nils Diewald67f54042014-09-27 14:53:38 +000041
42 /*
43 TODO: Update to new Tokeanstream API
44 http://www.hankcs.com/program/java/lucene-4-6-1-java-lang-illegalstateexception-tokenstream-contract-violation.html
45 */
46
Nils Diewaldd0d6feb2014-02-26 18:51:08 +000047 private static final Pattern pattern = Pattern.compile("\\[(?:\\([0-9]+-[0-9]+\\))?([^\\]]+?)\\]");
Nils Diewaldf399a672013-11-18 17:55:22 +000048
Nils Diewald82a4b862014-02-20 21:17:41 +000049 // This advices the java compiler to ignore all loggings
50 public static final boolean DEBUG = false;
Nils Diewaldd0d6feb2014-02-26 18:51:08 +000051 private final Logger log = LoggerFactory.getLogger(MultiTermTokenStream.class);
Nils Diewald82a4b862014-02-20 21:17:41 +000052
Nils Diewaldd0d6feb2014-02-26 18:51:08 +000053 private List<MultiTermToken> multiTermTokens;
54 private int mttIndex = 0, mtIndex = 0;
55 private static short i = 0;
56
57
58 /**
59 * The empty Constructor.
60 */
Nils Diewaldf399a672013-11-18 17:55:22 +000061 public MultiTermTokenStream () {
Nils Diewaldd0d6feb2014-02-26 18:51:08 +000062 this.charTermAttr = this.addAttribute(CharTermAttribute.class);
63 this.posIncrAttr = this.addAttribute(PositionIncrementAttribute.class);
64 this.payloadAttr = this.addAttribute(PayloadAttribute.class);
65 this.multiTermTokens = new ArrayList<MultiTermToken>(100);
Nils Diewaldf399a672013-11-18 17:55:22 +000066 };
67
Nils Diewaldd0d6feb2014-02-26 18:51:08 +000068
69 /**
70 * The Constructor.
71 *
72 * @param stream The MultiTermTokenStream as a string representation.
73 */
Nils Diewaldf399a672013-11-18 17:55:22 +000074 public MultiTermTokenStream (String stream) {
75 this();
Nils Diewaldb5b7b8d2014-06-06 18:41:54 +000076 this._fromString(stream);
77 };
Nils Diewaldf399a672013-11-18 17:55:22 +000078
Nils Diewaldb5b7b8d2014-06-06 18:41:54 +000079 /**
80 * The Constructor.
81 *
82 * @param stream The MultiTermTokenStream as a reader object.
83 */
84 public MultiTermTokenStream (Reader stream) throws IOException {
85 this();
86
87 StringBuilder sb = new StringBuilder(4096);
88 char[] buf = new char[128];
89 int i;
90 while ((i = stream.read(buf)) > 0) {
91 sb.append(buf, 0, i);
Nils Diewaldf399a672013-11-18 17:55:22 +000092 };
Nils Diewaldb5b7b8d2014-06-06 18:41:54 +000093 this._fromString(sb.toString());
Nils Diewaldf399a672013-11-18 17:55:22 +000094 };
95
Nils Diewaldd0d6feb2014-02-26 18:51:08 +000096
97 /**
98 * Add a MultiTermToken to the end of the MultiTermTokenStream.
99 *
100 * @param mtt A MultiTermToken.
101 */
Nils Diewaldf399a672013-11-18 17:55:22 +0000102 public void addMultiTermToken (MultiTermToken mtt) {
103 this.multiTermTokens.add(mtt);
104 };
105
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000106
107 /**
108 * Add a MultiTermToken by means of MultiTerms to the end of
109 * the MultiTermTokenStream.
110 *
111 * @param term At least one MultiTerm.
112 */
Nils Diewaldf399a672013-11-18 17:55:22 +0000113 public void addMultiTermToken (MultiTerm term, MultiTerm ... moreTerms) {
114 this.addMultiTermToken(new MultiTermToken(term, moreTerms));
115 };
116
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000117
118 /**
119 * Add a MultiTermToken by means of a single MultiTerm to the end of
120 * the MultiTermTokenStream.
121 *
122 * @param prefix A prefix character of a surface form of a MultiTerm.
123 * @param surface A surface string of a MultiTerm.
124 */
Nils Diewaldf399a672013-11-18 17:55:22 +0000125 public void addMultiTermToken (char prefix, String surface) {
126 this.addMultiTermToken(new MultiTermToken(prefix, surface));
127 };
128
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000129
130 /**
131 * Add a MultiTermToken by means of a a series of surface strings
132 * to the end of the MultiTermTokenStream.
133 *
134 * @param surface At least one surface string of a MultiTerm.
135 */
Nils Diewaldf399a672013-11-18 17:55:22 +0000136 public void addMultiTermToken (String surface, String ... moreTerms) {
137 this.addMultiTermToken(new MultiTermToken(surface, moreTerms));
138 };
139
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000140
141 /**
142 * Add meta information to the MultiTermTokenStream.
143 *
144 * @param key A string for denoting the meta information.
145 * @param value The value of the meta key as a string.
146 */
Nils Diewaldf399a672013-11-18 17:55:22 +0000147 public void addMeta (String key, String value) {
148 MultiTerm mt = new MultiTerm('-', key);
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000149 mt.setPayload(value);
Nils Diewaldf399a672013-11-18 17:55:22 +0000150 this.multiTermTokens.get(0).add(mt);
151 };
152
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000153
154 /**
155 * Add meta information to the MultiTermTokenStream.
156 *
157 * @param key A string for denoting the meta information.
158 * @param value The value of the meta key as a byte array.
159 */
Nils Diewaldf399a672013-11-18 17:55:22 +0000160 public void addMeta (String key, byte[] value) {
161 MultiTerm mt = new MultiTerm('-', key);
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000162 mt.setPayload(value);
Nils Diewaldf399a672013-11-18 17:55:22 +0000163 this.multiTermTokens.get(0).add(mt);
164 };
165
166
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000167 /**
168 * Add meta information to the MultiTermTokenStream.
169 *
170 * @param key A string for denoting the meta information.
171 * @param value The value of the meta key as a short value.
172 */
Nils Diewaldf399a672013-11-18 17:55:22 +0000173 public void addMeta (String key, short value) {
174 MultiTerm mt = new MultiTerm('-', key);
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000175 mt.setPayload(value);
Nils Diewaldf399a672013-11-18 17:55:22 +0000176 this.multiTermTokens.get(0).add(mt);
177 };
178
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000179
180 /**
181 * Add meta information to the MultiTermTokenStream.
182 *
183 * @param key A string for denoting the meta information.
184 * @param value The value of the meta key as a long value.
185 */
Nils Diewaldf399a672013-11-18 17:55:22 +0000186 public void addMeta (String key, long value) {
187 MultiTerm mt = new MultiTerm('-', key);
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000188 mt.setPayload(value);
Nils Diewaldf399a672013-11-18 17:55:22 +0000189 this.multiTermTokens.get(0).add(mt);
190 };
191
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000192
193 /**
194 * Add meta information to the MultiTermTokenStream.
195 *
196 * @param key A string for denoting the meta information.
197 * @param value The value of the meta key as a integer value.
198 */
Nils Diewaldf399a672013-11-18 17:55:22 +0000199 public void addMeta (String key, int value) {
200 MultiTerm mt = new MultiTerm('-', key);
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000201 mt.setPayload(value);
Nils Diewaldf399a672013-11-18 17:55:22 +0000202 this.multiTermTokens.get(0).add(mt);
203 };
204
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000205
206 /**
207 * Increment the token in the MultiTermTokenStream.
208 * This overrides the function in Lucene's TokenStream.
209 */
Nils Diewaldf399a672013-11-18 17:55:22 +0000210 @Override
211 public final boolean incrementToken() throws IOException {
212 this.payloadAttr.setPayload(null);
213
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000214 // Last token reached
Nils Diewaldf399a672013-11-18 17:55:22 +0000215 if (this.multiTermTokens.size() == this.mttIndex) {
216 reset();
217 return false;
218 };
219
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000220 // Get current token
Nils Diewaldf399a672013-11-18 17:55:22 +0000221 MultiTermToken mtt = this.multiTermTokens.get( this.mttIndex );
222
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000223 // Last term reached
Nils Diewaldf399a672013-11-18 17:55:22 +0000224 if (mtt.terms.size() == this.mtIndex) {
225 this.mtIndex = 0;
226 this.mttIndex++;
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000227
228 // Last term of last token reached
Nils Diewaldf399a672013-11-18 17:55:22 +0000229 if (this.multiTermTokens.size() == this.mttIndex) {
230 reset();
231 return false;
232 }
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000233
234 // Get last token
Nils Diewaldf399a672013-11-18 17:55:22 +0000235 else {
236 mtt = this.multiTermTokens.get( this.mttIndex );
237 };
238 };
239
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000240 // Get current term
Nils Diewaldf399a672013-11-18 17:55:22 +0000241 MultiTerm mt = mtt.terms.get(this.mtIndex);
242
Nils Diewaldf399a672013-11-18 17:55:22 +0000243 // Set the relative position to the former term
244 posIncrAttr.setPositionIncrement( mt.posIncr );
245 charTermAttr.setEmpty();
246 charTermAttr.append( mt.term );
247
248 BytesRef payload = new BytesRef();
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000249
250 // There is offset information
Nils Diewaldf399a672013-11-18 17:55:22 +0000251 if (mt.start != mt.end) {
Nils Diewald82a4b862014-02-20 21:17:41 +0000252 if (DEBUG)
253 log.trace("MultiTerm with payload offset: {}-{}", mt.start, mt.end);
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000254
255 // Add offsets to BytesRef payload
Nils Diewaldf399a672013-11-18 17:55:22 +0000256 payload.append(new BytesRef(int2byte(mt.start)));
257 payload.append(new BytesRef(int2byte(mt.end)));
Nils Diewaldf399a672013-11-18 17:55:22 +0000258 };
259
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000260 // There is payload in the MultiTerm
Nils Diewaldf399a672013-11-18 17:55:22 +0000261 if (mt.payload != null) {
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000262 payload.append(mt.payload);
Nils Diewald82a4b862014-02-20 21:17:41 +0000263 if (DEBUG)
264 log.trace("Create payload[1] {}", payload.toString());
Nils Diewaldf399a672013-11-18 17:55:22 +0000265 };
266
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000267 // There is payload in the current token to index
Nils Diewaldf399a672013-11-18 17:55:22 +0000268 if (payload.length > 0) {
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000269 payloadAttr.setPayload(payload);
Nils Diewald82a4b862014-02-20 21:17:41 +0000270 if (DEBUG)
271 log.trace("Set payload[2] {}", payload.toString());
Nils Diewaldf399a672013-11-18 17:55:22 +0000272 };
273
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000274 if (DEBUG) {
Nils Diewaldf399a672013-11-18 17:55:22 +0000275 StringBuilder sb = new StringBuilder("Index: [");
276 sb.append(mt.term);
277 if (payload.length > 0)
278 sb.append('$').append(payload.toString());
279 sb.append(']');
280 sb.append(" with increment ").append(mt.posIncr);
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000281
282 log.trace(sb.toString());
Nils Diewaldf399a672013-11-18 17:55:22 +0000283 };
284
285 this.mtIndex++;
286
287 return true;
288 };
289
290 public String toString () {
291 StringBuffer sb = new StringBuffer();
292 for (MultiTermToken mtt : this.multiTermTokens) {
293 sb.append( mtt.toString() );
294 };
295 return sb.toString();
296 };
297
Nils Diewaldb5b7b8d2014-06-06 18:41:54 +0000298 private void _fromString (String stream) {
299 Matcher matcher = pattern.matcher(stream);
300
301 while (matcher.find()) {
302 String[] seg = matcher.group(1).split("\\|");
303 MultiTermToken mtt = new MultiTermToken( seg[0] );
304
305 for (i = 1; i < seg.length; i++)
306 mtt.add(seg[i]);
307
308 this.addMultiTermToken(mtt);
309 };
310 };
311
Nils Diewaldf399a672013-11-18 17:55:22 +0000312 @Override
313 public void reset() {
314 this.mttIndex = 0;
315 this.mtIndex = 0;
316 };
317};