| Akron | e64cc16 | 2019-01-08 18:40:37 +0100 | [diff] [blame] | 1 | package de.ids_mannheim.korap.response; |
| 2 | |
| 3 | import org.slf4j.Logger; |
| 4 | import org.slf4j.LoggerFactory; |
| 5 | |
| 6 | import com.fasterxml.jackson.annotation.*; |
| 7 | import com.fasterxml.jackson.annotation.JsonInclude.Include; |
| 8 | import com.fasterxml.jackson.databind.ObjectMapper; |
| 9 | import com.fasterxml.jackson.databind.JsonNode; |
| 10 | import com.fasterxml.jackson.databind.node.ObjectNode; |
| 11 | import com.fasterxml.jackson.databind.node.ArrayNode; |
| 12 | |
| 13 | import de.ids_mannheim.korap.index.AbstractDocument; |
| 14 | import de.ids_mannheim.korap.util.KrillDate; |
| 15 | |
| 16 | import java.io.IOException; |
| 17 | |
| 18 | import de.ids_mannheim.korap.index.KeywordAnalyzer; |
| 19 | import org.apache.lucene.analysis.TokenStream; |
| 20 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| 21 | |
| 22 | import java.io.StringReader; |
| 23 | |
| 24 | import java.util.*; |
| 25 | import java.util.regex.*; |
| 26 | |
| 27 | import org.apache.lucene.index.*; |
| Akron | c7a2abc | 2019-01-17 14:21:34 +0100 | [diff] [blame] | 28 | import org.apache.lucene.document.FieldType; |
| Akron | e64cc16 | 2019-01-08 18:40:37 +0100 | [diff] [blame] | 29 | |
| Akron | 50e5f61 | 2019-01-16 12:52:39 +0100 | [diff] [blame] | 30 | public class MetaFieldsObj implements Iterable<MetaField> { |
| Akron | e64cc16 | 2019-01-08 18:40:37 +0100 | [diff] [blame] | 31 | |
| 32 | // Logger |
| 33 | private final static Logger log = LoggerFactory.getLogger(MetaFields.class); |
| 34 | |
| 35 | // This advices the java compiler to ignore all loggings |
| 36 | public static final boolean DEBUG = false; |
| 37 | |
| Akron | c7a2abc | 2019-01-17 14:21:34 +0100 | [diff] [blame] | 38 | private static final Pattern dateValuePattern = Pattern.compile("^([0-9]{8})$"); |
| Akron | e64cc16 | 2019-01-08 18:40:37 +0100 | [diff] [blame] | 39 | |
| 40 | // Mapper for JSON serialization |
| 41 | ObjectMapper mapper = new ObjectMapper(); |
| 42 | |
| Akron | 1a8bb76 | 2019-01-18 15:48:59 +0100 | [diff] [blame] | 43 | public List<String> fieldsOrder; |
| 44 | |
| Akron | e64cc16 | 2019-01-08 18:40:37 +0100 | [diff] [blame] | 45 | private Map<String, MetaField> fieldsMap = new HashMap<>(); |
| 46 | |
| 47 | |
| Akron | 50e5f61 | 2019-01-16 12:52:39 +0100 | [diff] [blame] | 48 | public MetaFieldsObj () {}; |
| Akron | e64cc16 | 2019-01-08 18:40:37 +0100 | [diff] [blame] | 49 | |
| 50 | |
| 51 | /** |
| 52 | * Add field to collection |
| 53 | */ |
| Akron | 2b921a6 | 2019-01-14 18:52:45 +0100 | [diff] [blame] | 54 | public MetaField add (IndexableField iField) { |
| Akron | 4376e74 | 2019-01-16 15:02:30 +0100 | [diff] [blame] | 55 | return this.add( |
| 56 | metaFieldFromIndexableField( |
| 57 | iField, |
| Akron | 1a8bb76 | 2019-01-18 15:48:59 +0100 | [diff] [blame] | 58 | new MetaField(iField.name(), "type:string") |
| Akron | 4376e74 | 2019-01-16 15:02:30 +0100 | [diff] [blame] | 59 | ) |
| 60 | ); |
| Akron | 2b921a6 | 2019-01-14 18:52:45 +0100 | [diff] [blame] | 61 | }; |
| 62 | |
| 63 | |
| 64 | /** |
| 65 | * Add field to collection |
| 66 | */ |
| 67 | public MetaField add (MetaField mf) { |
| 68 | // Ignore non-stored fields |
| 69 | if (mf == null) |
| 70 | return null; |
| 71 | |
| Akron | de4f085 | 2019-01-16 16:29:44 +0100 | [diff] [blame] | 72 | if (fieldsMap.containsKey(mf.key)) { |
| 73 | fieldsMap.get(mf.key).values.addAll(mf.values); |
| 74 | return fieldsMap.get(mf.key); |
| 75 | }; |
| 76 | |
| Akron | 2b921a6 | 2019-01-14 18:52:45 +0100 | [diff] [blame] | 77 | fieldsMap.put(mf.key, mf); |
| 78 | return mf; |
| 79 | }; |
| 80 | |
| Akron | 1a8bb76 | 2019-01-18 15:48:59 +0100 | [diff] [blame] | 81 | |
| Akron | 2b921a6 | 2019-01-14 18:52:45 +0100 | [diff] [blame] | 82 | // Field type needs to be restored heuristically |
| 83 | // - though that's not very elegant |
| Akron | 4376e74 | 2019-01-16 15:02:30 +0100 | [diff] [blame] | 84 | public static MetaField metaFieldFromIndexableField (IndexableField iField, MetaField mf) { |
| Akron | e64cc16 | 2019-01-08 18:40:37 +0100 | [diff] [blame] | 85 | IndexableFieldType iFieldType = iField.fieldType(); |
| 86 | |
| 87 | // Field type needs to be restored heuristically |
| 88 | // - though that's not very elegant |
| 89 | |
| 90 | // Ignore non-stored fields |
| 91 | if (!iFieldType.stored()) |
| Akron | 2b921a6 | 2019-01-14 18:52:45 +0100 | [diff] [blame] | 92 | return null; |
| Akron | e64cc16 | 2019-01-08 18:40:37 +0100 | [diff] [blame] | 93 | |
| 94 | // TODO: Check if metaField exists for that field |
| 95 | |
| 96 | Number n = iField.numericValue(); |
| 97 | String s = iField.stringValue(); |
| 98 | |
| 99 | // Field has numeric value (possibly a date) |
| 100 | if (n != null) { |
| 101 | |
| 102 | // Check if key indicates a date |
| Akron | c7a2abc | 2019-01-17 14:21:34 +0100 | [diff] [blame] | 103 | Matcher dateMatcher = dateValuePattern.matcher(n.toString()); |
| 104 | if (dateMatcher.matches()) { |
| Akron | 2b921a6 | 2019-01-14 18:52:45 +0100 | [diff] [blame] | 105 | mf.type = "type:date"; |
| Akron | c7a2abc | 2019-01-17 14:21:34 +0100 | [diff] [blame] | 106 | |
| Akron | 2b921a6 | 2019-01-14 18:52:45 +0100 | [diff] [blame] | 107 | KrillDate date = new KrillDate(n.toString()); |
| Akron | e64cc16 | 2019-01-08 18:40:37 +0100 | [diff] [blame] | 108 | if (date != null) { |
| 109 | |
| 110 | // Serialize withz dash separation |
| 111 | mf.values.add(date.toDisplay()); |
| 112 | }; |
| Akron | 2b921a6 | 2019-01-14 18:52:45 +0100 | [diff] [blame] | 113 | } |
| Akron | e64cc16 | 2019-01-08 18:40:37 +0100 | [diff] [blame] | 114 | |
| 115 | // Field is a number |
| 116 | else { |
| Akron | 50e5f61 | 2019-01-16 12:52:39 +0100 | [diff] [blame] | 117 | mf.type = "type:integer"; |
| Akron | e96f9fe | 2023-08-28 11:36:12 +0200 | [diff] [blame] | 118 | mf.values.add(Integer.valueOf(n.intValue()).toString()); |
| Akron | e64cc16 | 2019-01-08 18:40:37 +0100 | [diff] [blame] | 119 | }; |
| 120 | } |
| 121 | |
| 122 | // Field has a textual value |
| 123 | else if (s != null) { |
| 124 | |
| Akron | 2b921a6 | 2019-01-14 18:52:45 +0100 | [diff] [blame] | 125 | // Stored |
| Akron | e64cc16 | 2019-01-08 18:40:37 +0100 | [diff] [blame] | 126 | if (iFieldType.indexOptions() == IndexOptions.NONE) { |
| Akron | 2b921a6 | 2019-01-14 18:52:45 +0100 | [diff] [blame] | 127 | |
| Akron | e64cc16 | 2019-01-08 18:40:37 +0100 | [diff] [blame] | 128 | String value = s.toString(); |
| 129 | if (value.startsWith("data:")) { |
| 130 | mf.type = "type:attachement"; |
| 131 | } |
| 132 | else { |
| 133 | mf.type = "type:store"; |
| 134 | }; |
| 135 | mf.values.add(value); |
| Akron | 2b921a6 | 2019-01-14 18:52:45 +0100 | [diff] [blame] | 136 | return mf; |
| Akron | e64cc16 | 2019-01-08 18:40:37 +0100 | [diff] [blame] | 137 | } |
| 138 | |
| 139 | // Keywords |
| 140 | else if (iFieldType.indexOptions() == IndexOptions.DOCS_AND_FREQS) { |
| 141 | mf.type = "type:keywords"; |
| 142 | |
| 143 | // Analyze keywords |
| 144 | try { |
| 145 | StringReader reader = new StringReader(s.toString()); |
| 146 | KeywordAnalyzer kwa = new KeywordAnalyzer(); |
| 147 | TokenStream ts = kwa.tokenStream("-", reader); |
| 148 | CharTermAttribute term; |
| 149 | ts.reset(); |
| 150 | while (ts.incrementToken()) { |
| 151 | term = ts.getAttribute(CharTermAttribute.class); |
| 152 | mf.values.add(term.toString()); |
| 153 | }; |
| 154 | ts.close(); |
| 155 | reader.close(); |
| 156 | } |
| 157 | catch (IOException e) { |
| 158 | log.error("Unable to split {}={}", iField.name(), s.toString()); |
| 159 | } |
| 160 | } |
| 161 | |
| 162 | // Text |
| 163 | else if (iFieldType.indexOptions() != IndexOptions.DOCS) { |
| 164 | mf.type = "type:text"; |
| 165 | mf.values.add(s.toString()); |
| 166 | } |
| 167 | |
| Akron | 2b921a6 | 2019-01-14 18:52:45 +0100 | [diff] [blame] | 168 | // Special treatment for legacy indices |
| 169 | else if (mf.key.equals("UID")) { |
| 170 | mf.type = "type:integer"; |
| 171 | mf.values.add(s.toString()); |
| 172 | } |
| 173 | |
| Akron | e64cc16 | 2019-01-08 18:40:37 +0100 | [diff] [blame] | 174 | // String |
| 175 | else { |
| 176 | mf.values.add(s.toString()); |
| 177 | }; |
| 178 | } |
| 179 | |
| 180 | else { |
| 181 | log.error("Unknown field type {}", iField.name()); |
| 182 | }; |
| Akron | e64cc16 | 2019-01-08 18:40:37 +0100 | [diff] [blame] | 183 | |
| Akron | 2b921a6 | 2019-01-14 18:52:45 +0100 | [diff] [blame] | 184 | mf.values.removeAll(Collections.singleton(null)); |
| Akron | e64cc16 | 2019-01-08 18:40:37 +0100 | [diff] [blame] | 185 | |
| Akron | 2b921a6 | 2019-01-14 18:52:45 +0100 | [diff] [blame] | 186 | return mf; |
| Akron | e64cc16 | 2019-01-08 18:40:37 +0100 | [diff] [blame] | 187 | }; |
| 188 | |
| 189 | |
| 190 | /** |
| 191 | * Get field from collection |
| Akron | 32b9519 | 2019-01-11 13:58:55 +0100 | [diff] [blame] | 192 | * |
| 193 | * @param key |
| 194 | * The key of the field |
| Akron | e64cc16 | 2019-01-08 18:40:37 +0100 | [diff] [blame] | 195 | */ |
| 196 | public MetaField get (String key) { |
| 197 | return fieldsMap.get(key); |
| 198 | }; |
| Akron | 32b9519 | 2019-01-11 13:58:55 +0100 | [diff] [blame] | 199 | |
| 200 | |
| 201 | /** |
| 202 | * Check for field existence. |
| 203 | * |
| 204 | * @param key |
| 205 | * The key of the field |
| 206 | */ |
| 207 | public Boolean contains (String key) { |
| 208 | return fieldsMap.containsKey(key); |
| 209 | }; |
| 210 | |
| Akron | be9638d | 2019-02-07 17:09:42 +0100 | [diff] [blame] | 211 | |
| Akron | 1a8bb76 | 2019-01-18 15:48:59 +0100 | [diff] [blame] | 212 | private Iterator<String> getIterator () { |
| 213 | if (this.fieldsOrder == null) { |
| 214 | return fieldsMap.keySet().iterator(); |
| 215 | }; |
| 216 | return this.fieldsOrder.iterator(); |
| 217 | }; |
| Akron | 32b9519 | 2019-01-11 13:58:55 +0100 | [diff] [blame] | 218 | |
| 219 | |
| 220 | @Override |
| 221 | public Iterator<MetaField> iterator() { |
| 222 | return new Iterator<MetaField>() { |
| 223 | |
| Akron | 1a8bb76 | 2019-01-18 15:48:59 +0100 | [diff] [blame] | 224 | private Iterator<String> it = getIterator(); |
| 225 | |
| Akron | 32b9519 | 2019-01-11 13:58:55 +0100 | [diff] [blame] | 226 | private int currentIndex = 0; |
| 227 | |
| 228 | @Override |
| 229 | public boolean hasNext() { |
| 230 | return it.hasNext(); |
| 231 | }; |
| 232 | |
| 233 | @Override |
| 234 | public MetaField next() { |
| Akron | 1a8bb76 | 2019-01-18 15:48:59 +0100 | [diff] [blame] | 235 | String key = it.next(); |
| 236 | MetaField mf = fieldsMap.get(key); |
| 237 | if (mf == null) |
| 238 | return new MetaField(key); |
| 239 | return mf; |
| Akron | 32b9519 | 2019-01-11 13:58:55 +0100 | [diff] [blame] | 240 | }; |
| 241 | |
| 242 | @Override |
| 243 | public void remove() { |
| 244 | throw new UnsupportedOperationException(); |
| 245 | }; |
| 246 | }; |
| 247 | }; |
| Akron | e64cc16 | 2019-01-08 18:40:37 +0100 | [diff] [blame] | 248 | }; |