| Nils Diewald | e4986d7 | 2015-02-27 17:35:00 +0000 | [diff] [blame] | 1 | package de.ids_mannheim.korap.index; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 2 | |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 3 | import de.ids_mannheim.korap.util.CorpusDataException; |
| 4 | |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 5 | import java.util.*; |
| 6 | |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 7 | import org.slf4j.Logger; |
| 8 | import org.slf4j.LoggerFactory; |
| 9 | |
| 10 | |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 11 | |
| 12 | /** |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 13 | * |
| Nils Diewald | cb8afb3 | 2015-02-04 21:12:37 +0000 | [diff] [blame] | 14 | * A MultiTermToken represents a set of {@link MultiTerm MultiTerms} |
| 15 | * starting at the same position, i.e. represents a segment |
| 16 | * in a {@link MultiTermTokenStream}. |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 17 | * |
| Nils Diewald | cb8afb3 | 2015-02-04 21:12:37 +0000 | [diff] [blame] | 18 | * <blockquote><pre> |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 19 | * MultiTermToken mtt = new MultiTermToken("t:test", "a:abbruch"); |
| 20 | * mtt.add("b:banane"); |
| 21 | * System.err.println(mtt.toString()); |
| 22 | * // [t:test|a:abbruch|b:banane] |
| Nils Diewald | cb8afb3 | 2015-02-04 21:12:37 +0000 | [diff] [blame] | 23 | * </pre></blockquote> |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 24 | * |
| Nils Diewald | cb8afb3 | 2015-02-04 21:12:37 +0000 | [diff] [blame] | 25 | * @author diewald |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 26 | */ |
| 27 | public class MultiTermToken { |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 28 | public List<MultiTerm> terms; |
| Nils Diewald | fe6a365 | 2015-02-05 20:34:27 +0000 | [diff] [blame] | 29 | private short i = 0; |
| Nils Diewald | dd46b34 | 2015-02-04 22:38:29 +0000 | [diff] [blame] | 30 | private boolean sorted = false; |
| Nils Diewald | cb8afb3 | 2015-02-04 21:12:37 +0000 | [diff] [blame] | 31 | |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 32 | // This advices the java compiler to ignore all loggings |
| 33 | public static final boolean DEBUG = false; |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 34 | private final Logger log = LoggerFactory |
| 35 | .getLogger(MultiTermTokenStream.class); |
| 36 | |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 37 | |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 38 | /** |
| Nils Diewald | cb8afb3 | 2015-02-04 21:12:37 +0000 | [diff] [blame] | 39 | * Construct a new MultiTermToken by passing a stream of |
| 40 | * {@link MultiTerm MultiTerms}. |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 41 | * |
| 42 | * @param terms |
| 43 | * Take at least one {@link MultiTerm} object for a |
| 44 | * token. |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 45 | */ |
| Nils Diewald | cb8afb3 | 2015-02-04 21:12:37 +0000 | [diff] [blame] | 46 | public MultiTermToken (MultiTerm terms, MultiTerm ... moreTerms) { |
| 47 | this.terms = new ArrayList<MultiTerm>(16); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 48 | |
| 49 | this.terms.add(terms); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 50 | |
| Nils Diewald | cb8afb3 | 2015-02-04 21:12:37 +0000 | [diff] [blame] | 51 | // Further elements on same position |
| 52 | for (i = 0; i < moreTerms.length; i++) { |
| Nils Diewald | cb8afb3 | 2015-02-04 21:12:37 +0000 | [diff] [blame] | 53 | this.terms.add(moreTerms[i]); |
| 54 | }; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 55 | }; |
| 56 | |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 57 | |
| 58 | /** |
| Nils Diewald | cb8afb3 | 2015-02-04 21:12:37 +0000 | [diff] [blame] | 59 | * Construct a new MultiTermToken by passing a {@link MultiTerm} |
| 60 | * represented as a prefixed string. |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 61 | * |
| 62 | * @param prefix |
| 63 | * The term prefix. |
| 64 | * @param surface |
| 65 | * The term surface. |
| Nils Diewald | cb8afb3 | 2015-02-04 21:12:37 +0000 | [diff] [blame] | 66 | * @see MultiTerm |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 67 | */ |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 68 | public MultiTermToken (char prefix, String surface) { |
| Nils Diewald | cb8afb3 | 2015-02-04 21:12:37 +0000 | [diff] [blame] | 69 | this.terms = new ArrayList<MultiTerm>(16); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 70 | |
| Nils Diewald | cb8afb3 | 2015-02-04 21:12:37 +0000 | [diff] [blame] | 71 | // Create a new MultiTerm |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 72 | try { |
| 73 | MultiTerm term = new MultiTerm(prefix, surface); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 74 | |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 75 | // First word element |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 76 | terms.add(term); |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 77 | } |
| 78 | catch (CorpusDataException cde) { |
| 79 | log.error("{}: {}", cde.getErrorCode(), cde.getMessage()); |
| 80 | }; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 81 | }; |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 82 | |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 83 | |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 84 | /** |
| Nils Diewald | cb8afb3 | 2015-02-04 21:12:37 +0000 | [diff] [blame] | 85 | * Construct a new MultiTermToken by passing a stream of |
| 86 | * {@link MultiTerm MultiTerms} represented as strings. |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 87 | * |
| 88 | * @param terms |
| 89 | * Take at least one {@link MultiTerm} string for a |
| 90 | * token. |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 91 | */ |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 92 | public MultiTermToken (String terms, String ... moreTerms) |
| 93 | throws CorpusDataException { |
| Nils Diewald | cb8afb3 | 2015-02-04 21:12:37 +0000 | [diff] [blame] | 94 | this.terms = new ArrayList<MultiTerm>(16); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 95 | |
| Nils Diewald | cb8afb3 | 2015-02-04 21:12:37 +0000 | [diff] [blame] | 96 | MultiTerm term = new MultiTerm(terms); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 97 | |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 98 | try { |
| 99 | // First word element |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 100 | this.terms.add(term); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 101 | |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 102 | // Further elements on same position |
| 103 | for (i = 0; i < moreTerms.length; i++) { |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 104 | term = new MultiTerm(moreTerms[i]); |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 105 | this.terms.add(term); |
| 106 | }; |
| 107 | } |
| 108 | catch (CorpusDataException cde) { |
| 109 | log.error("{}: {}", cde.getErrorCode(), cde.getMessage()); |
| Nils Diewald | cb8afb3 | 2015-02-04 21:12:37 +0000 | [diff] [blame] | 110 | }; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 111 | }; |
| 112 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 113 | |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 114 | /** |
| Nils Diewald | cb8afb3 | 2015-02-04 21:12:37 +0000 | [diff] [blame] | 115 | * Add a new {@link MultiTerm} to the MultiTermToken. |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 116 | * |
| 117 | * @param term |
| 118 | * A {@link MultiTerm} object. |
| Nils Diewald | cb8afb3 | 2015-02-04 21:12:37 +0000 | [diff] [blame] | 119 | * @return The {@link MultiTermToken} object for chaining. |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 120 | */ |
| Nils Diewald | cb8afb3 | 2015-02-04 21:12:37 +0000 | [diff] [blame] | 121 | public MultiTermToken add (MultiTerm term) { |
| Nils Diewald | cb8afb3 | 2015-02-04 21:12:37 +0000 | [diff] [blame] | 122 | terms.add(term); |
| Nils Diewald | dd46b34 | 2015-02-04 22:38:29 +0000 | [diff] [blame] | 123 | this.sorted = false; |
| Nils Diewald | cb8afb3 | 2015-02-04 21:12:37 +0000 | [diff] [blame] | 124 | return this; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 125 | }; |
| 126 | |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 127 | |
| 128 | /** |
| Nils Diewald | cb8afb3 | 2015-02-04 21:12:37 +0000 | [diff] [blame] | 129 | * Add a new {@link MultiTerm} to the MultiTermToken. |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 130 | * |
| 131 | * @param term |
| 132 | * A MultiTerm represented as a surface string. |
| Nils Diewald | cb8afb3 | 2015-02-04 21:12:37 +0000 | [diff] [blame] | 133 | * @return The {@link MultiTermToken} object for chaining. |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 134 | */ |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 135 | public MultiTermToken add (String term) throws CorpusDataException { |
| Nils Diewald | cb8afb3 | 2015-02-04 21:12:37 +0000 | [diff] [blame] | 136 | if (term.length() == 0) |
| 137 | return this; |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 138 | |
| 139 | try { |
| 140 | this.add(new MultiTerm(term)); |
| 141 | } |
| 142 | catch (CorpusDataException cde) { |
| 143 | log.error("{}: {}", cde.getErrorCode(), cde.getMessage()); |
| 144 | }; |
| 145 | |
| 146 | return this; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 147 | }; |
| 148 | |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 149 | |
| 150 | /** |
| Nils Diewald | cb8afb3 | 2015-02-04 21:12:37 +0000 | [diff] [blame] | 151 | * Add a new {@link MultiTerm} to the MultiTermToken. |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 152 | * |
| 153 | * @param prefix |
| 154 | * A MultiTerm prefix. |
| 155 | * @param term |
| 156 | * A MultiTerm represented as a surface string. |
| Nils Diewald | cb8afb3 | 2015-02-04 21:12:37 +0000 | [diff] [blame] | 157 | * @return The {@link MultiTermToken} object for chaining. |
| 158 | */ |
| 159 | public MultiTermToken add (char prefix, String term) { |
| 160 | if (term.length() == 0) |
| 161 | return this; |
| Nils Diewald | 5c37570 | 2015-02-09 20:58:24 +0000 | [diff] [blame] | 162 | |
| 163 | try { |
| 164 | this.add(new MultiTerm(prefix, term)); |
| 165 | } |
| 166 | catch (CorpusDataException cde) { |
| 167 | log.error("{}: {}", cde.getErrorCode(), cde.getMessage()); |
| 168 | }; |
| 169 | |
| 170 | return this; |
| Nils Diewald | cb8afb3 | 2015-02-04 21:12:37 +0000 | [diff] [blame] | 171 | }; |
| 172 | |
| 173 | |
| 174 | /** |
| Nils Diewald | dd46b34 | 2015-02-04 22:38:29 +0000 | [diff] [blame] | 175 | * Get a {@link MultiTerm} by index. |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 176 | * |
| 177 | * @param index |
| 178 | * The index position of a {@link MultiTerm} in the |
| 179 | * {@link MultiTermToken}. |
| Nils Diewald | dd46b34 | 2015-02-04 22:38:29 +0000 | [diff] [blame] | 180 | * @return A {@link MultiTerm}. |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 181 | */ |
| Nils Diewald | dd46b34 | 2015-02-04 22:38:29 +0000 | [diff] [blame] | 182 | public MultiTerm get (int index) { |
| 183 | return this.sort().terms.get(index); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 184 | }; |
| 185 | |
| Nils Diewald | cb8afb3 | 2015-02-04 21:12:37 +0000 | [diff] [blame] | 186 | |
| 187 | /** |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 188 | * Get the number of {@link MultiTerm MultiTerms} in the |
| 189 | * MultiTermToken. |
| 190 | * |
| 191 | * @return The number of {@link MultiTerm MultiTerms} in the |
| 192 | * MultiTermToken. |
| Nils Diewald | cb8afb3 | 2015-02-04 21:12:37 +0000 | [diff] [blame] | 193 | */ |
| 194 | public int getSize () { |
| 195 | return this.terms.size(); |
| 196 | }; |
| 197 | |
| 198 | |
| Nils Diewald | dd46b34 | 2015-02-04 22:38:29 +0000 | [diff] [blame] | 199 | /** |
| 200 | * Sort the {@link MultiTerm MultiTerms} in the correct order. |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 201 | * |
| Nils Diewald | dd46b34 | 2015-02-04 22:38:29 +0000 | [diff] [blame] | 202 | * @return The {@link MultiTermToken} object for chaining. |
| 203 | */ |
| 204 | public MultiTermToken sort () { |
| 205 | if (this.sorted) |
| 206 | return this; |
| 207 | |
| 208 | Collections.sort(this.terms); |
| 209 | this.sorted = true; |
| 210 | return this; |
| 211 | }; |
| 212 | |
| Nils Diewald | cb8afb3 | 2015-02-04 21:12:37 +0000 | [diff] [blame] | 213 | |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 214 | /** |
| 215 | * Serialize the MultiTermToken to a string. |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 216 | * |
| Nils Diewald | cb8afb3 | 2015-02-04 21:12:37 +0000 | [diff] [blame] | 217 | * @return A string representation of the MultiTermToken, |
| 218 | * with leading offset information. |
| Nils Diewald | d0d6feb | 2014-02-26 18:51:08 +0000 | [diff] [blame] | 219 | */ |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 220 | public String toString () { |
| Nils Diewald | dd46b34 | 2015-02-04 22:38:29 +0000 | [diff] [blame] | 221 | this.sort(); |
| Nils Diewald | cb8afb3 | 2015-02-04 21:12:37 +0000 | [diff] [blame] | 222 | StringBuffer sb = new StringBuffer(); |
| 223 | sb.append('['); |
| Nils Diewald | cb8afb3 | 2015-02-04 21:12:37 +0000 | [diff] [blame] | 224 | for (i = 0; i < this.terms.size() - 1; i++) { |
| 225 | sb.append(this.terms.get(i).toString()).append('|'); |
| 226 | }; |
| 227 | sb.append(this.terms.get(i).toString()).append(']'); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 228 | |
| Nils Diewald | cb8afb3 | 2015-02-04 21:12:37 +0000 | [diff] [blame] | 229 | return sb.toString(); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 230 | }; |
| 231 | }; |