blob: 68ba827aec3ab45529f8776e0dc94d99d77aa6a3 [file] [log] [blame]
Nils Diewalde4986d72015-02-27 17:35:00 +00001package de.ids_mannheim.korap.index;
Nils Diewaldf399a672013-11-18 17:55:22 +00002
Nils Diewald5c375702015-02-09 20:58:24 +00003import de.ids_mannheim.korap.util.CorpusDataException;
4
Nils Diewaldf399a672013-11-18 17:55:22 +00005import java.util.*;
6
Nils Diewald5c375702015-02-09 20:58:24 +00007import org.slf4j.Logger;
8import org.slf4j.LoggerFactory;
9
10
Nils Diewaldf399a672013-11-18 17:55:22 +000011
12/**
Nils Diewaldbb33da22015-03-04 16:24:25 +000013 *
Nils Diewaldcb8afb32015-02-04 21:12:37 +000014 * A MultiTermToken represents a set of {@link MultiTerm MultiTerms}
15 * starting at the same position, i.e. represents a segment
16 * in a {@link MultiTermTokenStream}.
Nils Diewaldbb33da22015-03-04 16:24:25 +000017 *
Nils Diewaldcb8afb32015-02-04 21:12:37 +000018 * <blockquote><pre>
Nils Diewaldbb33da22015-03-04 16:24:25 +000019 * MultiTermToken mtt = new MultiTermToken("t:test", "a:abbruch");
20 * mtt.add("b:banane");
21 * System.err.println(mtt.toString());
22 * // [t:test|a:abbruch|b:banane]
Nils Diewaldcb8afb32015-02-04 21:12:37 +000023 * </pre></blockquote>
Nils Diewaldbb33da22015-03-04 16:24:25 +000024 *
Nils Diewaldcb8afb32015-02-04 21:12:37 +000025 * @author diewald
Nils Diewaldf399a672013-11-18 17:55:22 +000026 */
27public class MultiTermToken {
Nils Diewaldf399a672013-11-18 17:55:22 +000028 public List<MultiTerm> terms;
Nils Diewaldfe6a3652015-02-05 20:34:27 +000029 private short i = 0;
Nils Diewalddd46b342015-02-04 22:38:29 +000030 private boolean sorted = false;
Nils Diewaldcb8afb32015-02-04 21:12:37 +000031
Nils Diewald5c375702015-02-09 20:58:24 +000032 // This advices the java compiler to ignore all loggings
33 public static final boolean DEBUG = false;
Nils Diewaldbb33da22015-03-04 16:24:25 +000034 private final Logger log = LoggerFactory
35 .getLogger(MultiTermTokenStream.class);
36
Nils Diewald5c375702015-02-09 20:58:24 +000037
Nils Diewaldd0d6feb2014-02-26 18:51:08 +000038 /**
Nils Diewaldcb8afb32015-02-04 21:12:37 +000039 * Construct a new MultiTermToken by passing a stream of
40 * {@link MultiTerm MultiTerms}.
Nils Diewaldbb33da22015-03-04 16:24:25 +000041 *
42 * @param terms
43 * Take at least one {@link MultiTerm} object for a
44 * token.
Nils Diewaldd0d6feb2014-02-26 18:51:08 +000045 */
Nils Diewaldcb8afb32015-02-04 21:12:37 +000046 public MultiTermToken (MultiTerm terms, MultiTerm ... moreTerms) {
47 this.terms = new ArrayList<MultiTerm>(16);
Nils Diewaldbb33da22015-03-04 16:24:25 +000048
49 this.terms.add(terms);
Nils Diewaldf399a672013-11-18 17:55:22 +000050
Nils Diewaldcb8afb32015-02-04 21:12:37 +000051 // Further elements on same position
52 for (i = 0; i < moreTerms.length; i++) {
Nils Diewaldcb8afb32015-02-04 21:12:37 +000053 this.terms.add(moreTerms[i]);
54 };
Nils Diewaldf399a672013-11-18 17:55:22 +000055 };
56
Nils Diewaldd0d6feb2014-02-26 18:51:08 +000057
58 /**
Nils Diewaldcb8afb32015-02-04 21:12:37 +000059 * Construct a new MultiTermToken by passing a {@link MultiTerm}
60 * represented as a prefixed string.
Nils Diewaldbb33da22015-03-04 16:24:25 +000061 *
62 * @param prefix
63 * The term prefix.
64 * @param surface
65 * The term surface.
Nils Diewaldcb8afb32015-02-04 21:12:37 +000066 * @see MultiTerm
Nils Diewaldd0d6feb2014-02-26 18:51:08 +000067 */
Nils Diewaldf399a672013-11-18 17:55:22 +000068 public MultiTermToken (char prefix, String surface) {
Nils Diewaldcb8afb32015-02-04 21:12:37 +000069 this.terms = new ArrayList<MultiTerm>(16);
Nils Diewaldf399a672013-11-18 17:55:22 +000070
Nils Diewaldcb8afb32015-02-04 21:12:37 +000071 // Create a new MultiTerm
Nils Diewald5c375702015-02-09 20:58:24 +000072 try {
73 MultiTerm term = new MultiTerm(prefix, surface);
Nils Diewaldf399a672013-11-18 17:55:22 +000074
Nils Diewald5c375702015-02-09 20:58:24 +000075 // First word element
Nils Diewaldbb33da22015-03-04 16:24:25 +000076 terms.add(term);
Nils Diewald5c375702015-02-09 20:58:24 +000077 }
78 catch (CorpusDataException cde) {
79 log.error("{}: {}", cde.getErrorCode(), cde.getMessage());
80 };
Nils Diewaldf399a672013-11-18 17:55:22 +000081 };
Nils Diewaldbb33da22015-03-04 16:24:25 +000082
Nils Diewaldf399a672013-11-18 17:55:22 +000083
Nils Diewaldd0d6feb2014-02-26 18:51:08 +000084 /**
Nils Diewaldcb8afb32015-02-04 21:12:37 +000085 * Construct a new MultiTermToken by passing a stream of
86 * {@link MultiTerm MultiTerms} represented as strings.
Nils Diewaldbb33da22015-03-04 16:24:25 +000087 *
88 * @param terms
89 * Take at least one {@link MultiTerm} string for a
90 * token.
Nils Diewaldd0d6feb2014-02-26 18:51:08 +000091 */
Nils Diewaldbb33da22015-03-04 16:24:25 +000092 public MultiTermToken (String terms, String ... moreTerms)
93 throws CorpusDataException {
Nils Diewaldcb8afb32015-02-04 21:12:37 +000094 this.terms = new ArrayList<MultiTerm>(16);
Nils Diewaldf399a672013-11-18 17:55:22 +000095
Nils Diewaldcb8afb32015-02-04 21:12:37 +000096 MultiTerm term = new MultiTerm(terms);
Nils Diewaldf399a672013-11-18 17:55:22 +000097
Nils Diewald5c375702015-02-09 20:58:24 +000098 try {
99 // First word element
Nils Diewaldbb33da22015-03-04 16:24:25 +0000100 this.terms.add(term);
Nils Diewaldf399a672013-11-18 17:55:22 +0000101
Nils Diewald5c375702015-02-09 20:58:24 +0000102 // Further elements on same position
103 for (i = 0; i < moreTerms.length; i++) {
Nils Diewaldbb33da22015-03-04 16:24:25 +0000104 term = new MultiTerm(moreTerms[i]);
Nils Diewald5c375702015-02-09 20:58:24 +0000105 this.terms.add(term);
106 };
107 }
108 catch (CorpusDataException cde) {
109 log.error("{}: {}", cde.getErrorCode(), cde.getMessage());
Nils Diewaldcb8afb32015-02-04 21:12:37 +0000110 };
Nils Diewaldf399a672013-11-18 17:55:22 +0000111 };
112
Nils Diewaldbb33da22015-03-04 16:24:25 +0000113
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000114 /**
Nils Diewaldcb8afb32015-02-04 21:12:37 +0000115 * Add a new {@link MultiTerm} to the MultiTermToken.
Nils Diewaldbb33da22015-03-04 16:24:25 +0000116 *
117 * @param term
118 * A {@link MultiTerm} object.
Nils Diewaldcb8afb32015-02-04 21:12:37 +0000119 * @return The {@link MultiTermToken} object for chaining.
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000120 */
Nils Diewaldcb8afb32015-02-04 21:12:37 +0000121 public MultiTermToken add (MultiTerm term) {
Nils Diewaldcb8afb32015-02-04 21:12:37 +0000122 terms.add(term);
Nils Diewalddd46b342015-02-04 22:38:29 +0000123 this.sorted = false;
Nils Diewaldcb8afb32015-02-04 21:12:37 +0000124 return this;
Nils Diewaldf399a672013-11-18 17:55:22 +0000125 };
126
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000127
128 /**
Nils Diewaldcb8afb32015-02-04 21:12:37 +0000129 * Add a new {@link MultiTerm} to the MultiTermToken.
Nils Diewaldbb33da22015-03-04 16:24:25 +0000130 *
131 * @param term
132 * A MultiTerm represented as a surface string.
Nils Diewaldcb8afb32015-02-04 21:12:37 +0000133 * @return The {@link MultiTermToken} object for chaining.
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000134 */
Nils Diewald5c375702015-02-09 20:58:24 +0000135 public MultiTermToken add (String term) throws CorpusDataException {
Nils Diewaldcb8afb32015-02-04 21:12:37 +0000136 if (term.length() == 0)
137 return this;
Nils Diewald5c375702015-02-09 20:58:24 +0000138
139 try {
140 this.add(new MultiTerm(term));
141 }
142 catch (CorpusDataException cde) {
143 log.error("{}: {}", cde.getErrorCode(), cde.getMessage());
144 };
145
146 return this;
Nils Diewaldf399a672013-11-18 17:55:22 +0000147 };
148
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000149
150 /**
Nils Diewaldcb8afb32015-02-04 21:12:37 +0000151 * Add a new {@link MultiTerm} to the MultiTermToken.
Nils Diewaldbb33da22015-03-04 16:24:25 +0000152 *
153 * @param prefix
154 * A MultiTerm prefix.
155 * @param term
156 * A MultiTerm represented as a surface string.
Nils Diewaldcb8afb32015-02-04 21:12:37 +0000157 * @return The {@link MultiTermToken} object for chaining.
158 */
159 public MultiTermToken add (char prefix, String term) {
160 if (term.length() == 0)
161 return this;
Nils Diewald5c375702015-02-09 20:58:24 +0000162
163 try {
164 this.add(new MultiTerm(prefix, term));
165 }
166 catch (CorpusDataException cde) {
167 log.error("{}: {}", cde.getErrorCode(), cde.getMessage());
168 };
169
170 return this;
Nils Diewaldcb8afb32015-02-04 21:12:37 +0000171 };
172
173
174 /**
Nils Diewalddd46b342015-02-04 22:38:29 +0000175 * Get a {@link MultiTerm} by index.
Nils Diewaldbb33da22015-03-04 16:24:25 +0000176 *
177 * @param index
178 * The index position of a {@link MultiTerm} in the
179 * {@link MultiTermToken}.
Nils Diewalddd46b342015-02-04 22:38:29 +0000180 * @return A {@link MultiTerm}.
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000181 */
Nils Diewalddd46b342015-02-04 22:38:29 +0000182 public MultiTerm get (int index) {
183 return this.sort().terms.get(index);
Nils Diewaldf399a672013-11-18 17:55:22 +0000184 };
185
Nils Diewaldcb8afb32015-02-04 21:12:37 +0000186
187 /**
Nils Diewaldbb33da22015-03-04 16:24:25 +0000188 * Get the number of {@link MultiTerm MultiTerms} in the
189 * MultiTermToken.
190 *
191 * @return The number of {@link MultiTerm MultiTerms} in the
192 * MultiTermToken.
Nils Diewaldcb8afb32015-02-04 21:12:37 +0000193 */
194 public int getSize () {
195 return this.terms.size();
196 };
197
198
Nils Diewalddd46b342015-02-04 22:38:29 +0000199 /**
200 * Sort the {@link MultiTerm MultiTerms} in the correct order.
Nils Diewaldbb33da22015-03-04 16:24:25 +0000201 *
Nils Diewalddd46b342015-02-04 22:38:29 +0000202 * @return The {@link MultiTermToken} object for chaining.
203 */
204 public MultiTermToken sort () {
205 if (this.sorted)
206 return this;
207
208 Collections.sort(this.terms);
209 this.sorted = true;
210 return this;
211 };
212
Nils Diewaldcb8afb32015-02-04 21:12:37 +0000213
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000214 /**
215 * Serialize the MultiTermToken to a string.
Nils Diewaldbb33da22015-03-04 16:24:25 +0000216 *
Nils Diewaldcb8afb32015-02-04 21:12:37 +0000217 * @return A string representation of the MultiTermToken,
218 * with leading offset information.
Nils Diewaldd0d6feb2014-02-26 18:51:08 +0000219 */
Nils Diewaldf399a672013-11-18 17:55:22 +0000220 public String toString () {
Nils Diewalddd46b342015-02-04 22:38:29 +0000221 this.sort();
Nils Diewaldcb8afb32015-02-04 21:12:37 +0000222 StringBuffer sb = new StringBuffer();
223 sb.append('[');
Nils Diewaldcb8afb32015-02-04 21:12:37 +0000224 for (i = 0; i < this.terms.size() - 1; i++) {
225 sb.append(this.terms.get(i).toString()).append('|');
226 };
227 sb.append(this.terms.get(i).toString()).append(']');
Nils Diewaldbb33da22015-03-04 16:24:25 +0000228
Nils Diewaldcb8afb32015-02-04 21:12:37 +0000229 return sb.toString();
Nils Diewaldf399a672013-11-18 17:55:22 +0000230 };
231};