| Akron | 8414ae5 | 2026-05-19 13:31:14 +0200 | [diff] [blame] | 1 | package mapper |
| Akron | 4de47a9 | 2025-06-27 11:58:11 +0200 | [diff] [blame] | 2 | |
| 3 | import ( |
| 4 | "encoding/json" |
| 5 | "fmt" |
| 6 | |
| Akron | 2ef703c | 2025-07-03 15:57:42 +0200 | [diff] [blame] | 7 | "github.com/KorAP/Koral-Mapper/ast" |
| 8 | "github.com/KorAP/Koral-Mapper/matcher" |
| 9 | "github.com/KorAP/Koral-Mapper/parser" |
| Akron | 4de47a9 | 2025-06-27 11:58:11 +0200 | [diff] [blame] | 10 | ) |
| 11 | |
| Akron | 8414ae5 | 2026-05-19 13:31:14 +0200 | [diff] [blame] | 12 | // ApplyQueryMappings transforms a JSON query object using the mapping rules |
| 13 | // identified by mappingID. The input may be a bare query node or a wrapper |
| 14 | // object containing a "query" field; both forms are accepted. |
| Akron | 4de47a9 | 2025-06-27 11:58:11 +0200 | [diff] [blame] | 15 | func (m *Mapper) ApplyQueryMappings(mappingID string, opts MappingOptions, jsonData any) (any, error) { |
| Akron | 4de47a9 | 2025-06-27 11:58:11 +0200 | [diff] [blame] | 16 | if _, exists := m.mappingLists[mappingID]; !exists { |
| 17 | return nil, fmt.Errorf("mapping list with ID %s not found", mappingID) |
| 18 | } |
| 19 | |
| Akron | 2f93c58 | 2026-02-19 16:49:13 +0100 | [diff] [blame] | 20 | if m.mappingLists[mappingID].IsCorpus() { |
| 21 | return m.applyCorpusQueryMappings(mappingID, opts, jsonData) |
| 22 | } |
| 23 | |
| Akron | 2f93c58 | 2026-02-19 16:49:13 +0100 | [diff] [blame] | 24 | rules := m.parsedQueryRules[mappingID] |
| Akron | 4de47a9 | 2025-06-27 11:58:11 +0200 | [diff] [blame] | 25 | |
| Akron | 8414ae5 | 2026-05-19 13:31:14 +0200 | [diff] [blame] | 26 | // Detect wrapper: input may be {"query": ...} or a bare koral:token |
| Akron | 4de47a9 | 2025-06-27 11:58:11 +0200 | [diff] [blame] | 27 | var queryData any |
| 28 | var hasQueryWrapper bool |
| 29 | |
| 30 | if jsonMap, ok := jsonData.(map[string]any); ok { |
| 31 | if query, exists := jsonMap["query"]; exists { |
| 32 | queryData = query |
| 33 | hasQueryWrapper = true |
| 34 | } |
| 35 | } |
| 36 | |
| Akron | 4de47a9 | 2025-06-27 11:58:11 +0200 | [diff] [blame] | 37 | if !hasQueryWrapper { |
| Akron | 4de47a9 | 2025-06-27 11:58:11 +0200 | [diff] [blame] | 38 | if !isValidQueryObject(jsonData) { |
| 39 | return jsonData, nil |
| 40 | } |
| 41 | queryData = jsonData |
| 42 | } else if queryData == nil || !isValidQueryObject(queryData) { |
| Akron | 4de47a9 | 2025-06-27 11:58:11 +0200 | [diff] [blame] | 43 | return jsonData, nil |
| 44 | } |
| 45 | |
| Akron | 8414ae5 | 2026-05-19 13:31:14 +0200 | [diff] [blame] | 46 | // Strip pre-existing rewrites before AST conversion so they do not |
| 47 | // interfere with matching. They are restored after transformation. |
| Akron | 4de47a9 | 2025-06-27 11:58:11 +0200 | [diff] [blame] | 48 | var oldRewrites any |
| 49 | if queryMap, ok := queryData.(map[string]any); ok { |
| 50 | if rewrites, exists := queryMap["rewrites"]; exists { |
| 51 | oldRewrites = rewrites |
| 52 | delete(queryMap, "rewrites") |
| 53 | } |
| 54 | } |
| 55 | |
| Akron | 4de47a9 | 2025-06-27 11:58:11 +0200 | [diff] [blame] | 56 | jsonBytes, err := json.Marshal(queryData) |
| 57 | if err != nil { |
| 58 | return nil, fmt.Errorf("failed to marshal input JSON: %w", err) |
| 59 | } |
| 60 | |
| 61 | node, err := parser.ParseJSON(jsonBytes) |
| 62 | if err != nil { |
| 63 | return nil, fmt.Errorf("failed to parse JSON into AST: %w", err) |
| 64 | } |
| 65 | |
| Akron | 8414ae5 | 2026-05-19 13:31:14 +0200 | [diff] [blame] | 66 | // Unwrap Token so matching operates on the inner node; re-wrapped later. |
| Akron | 4de47a9 | 2025-06-27 11:58:11 +0200 | [diff] [blame] | 67 | isToken := false |
| 68 | var tokenWrap ast.Node |
| 69 | if token, ok := node.(*ast.Token); ok { |
| 70 | isToken = true |
| 71 | tokenWrap = token.Wrap |
| 72 | node = tokenWrap |
| 73 | } |
| 74 | |
| Akron | 8414ae5 | 2026-05-19 13:31:14 +0200 | [diff] [blame] | 75 | // Resolve foundry/layer overrides per direction once, before the rule loop. |
| Akron | 4de47a9 | 2025-06-27 11:58:11 +0200 | [diff] [blame] | 76 | var patternFoundry, patternLayer, replacementFoundry, replacementLayer string |
| Akron | 8414ae5 | 2026-05-19 13:31:14 +0200 | [diff] [blame] | 77 | if opts.Direction { |
| Akron | 4de47a9 | 2025-06-27 11:58:11 +0200 | [diff] [blame] | 78 | patternFoundry, patternLayer = opts.FoundryA, opts.LayerA |
| 79 | replacementFoundry, replacementLayer = opts.FoundryB, opts.LayerB |
| 80 | } else { |
| 81 | patternFoundry, patternLayer = opts.FoundryB, opts.LayerB |
| 82 | replacementFoundry, replacementLayer = opts.FoundryA, opts.LayerA |
| 83 | } |
| 84 | |
| Akron | 8414ae5 | 2026-05-19 13:31:14 +0200 | [diff] [blame] | 85 | // patternCache avoids redundant Clone+Override for the same rule index |
| 86 | // and foundry/layer combination across repeated calls. |
| Akron | 4de47a9 | 2025-06-27 11:58:11 +0200 | [diff] [blame] | 87 | type patternCacheKey struct { |
| 88 | ruleIndex int |
| 89 | foundry string |
| 90 | layer string |
| 91 | isReplacement bool |
| 92 | } |
| 93 | patternCache := make(map[patternCacheKey]ast.Node) |
| 94 | |
| Akron | 4de47a9 | 2025-06-27 11:58:11 +0200 | [diff] [blame] | 95 | for i, rule := range rules { |
| Akron | 4de47a9 | 2025-06-27 11:58:11 +0200 | [diff] [blame] | 96 | var pattern, replacement ast.Node |
| Akron | 8414ae5 | 2026-05-19 13:31:14 +0200 | [diff] [blame] | 97 | if opts.Direction { |
| Akron | 4de47a9 | 2025-06-27 11:58:11 +0200 | [diff] [blame] | 98 | pattern = rule.Upper |
| 99 | replacement = rule.Lower |
| 100 | } else { |
| 101 | pattern = rule.Lower |
| 102 | replacement = rule.Upper |
| 103 | } |
| 104 | |
| Akron | 4de47a9 | 2025-06-27 11:58:11 +0200 | [diff] [blame] | 105 | if token, ok := pattern.(*ast.Token); ok { |
| 106 | pattern = token.Wrap |
| 107 | } |
| 108 | if token, ok := replacement.(*ast.Token); ok { |
| 109 | replacement = token.Wrap |
| 110 | } |
| 111 | |
| Akron | 4de47a9 | 2025-06-27 11:58:11 +0200 | [diff] [blame] | 112 | patternKey := patternCacheKey{ruleIndex: i, foundry: patternFoundry, layer: patternLayer, isReplacement: false} |
| 113 | processedPattern, exists := patternCache[patternKey] |
| 114 | if !exists { |
| Akron | 4de47a9 | 2025-06-27 11:58:11 +0200 | [diff] [blame] | 115 | processedPattern = pattern.Clone() |
| Akron | 4de47a9 | 2025-06-27 11:58:11 +0200 | [diff] [blame] | 116 | if patternFoundry != "" || patternLayer != "" { |
| 117 | ast.ApplyFoundryAndLayerOverrides(processedPattern, patternFoundry, patternLayer) |
| 118 | } |
| 119 | patternCache[patternKey] = processedPattern |
| 120 | } |
| 121 | |
| Akron | 8414ae5 | 2026-05-19 13:31:14 +0200 | [diff] [blame] | 122 | // Probe for a match before cloning the replacement (lazy evaluation) |
| Akron | 4de47a9 | 2025-06-27 11:58:11 +0200 | [diff] [blame] | 123 | tempMatcher, err := matcher.NewMatcher(ast.Pattern{Root: processedPattern}, ast.Replacement{Root: &ast.Term{}}) |
| 124 | if err != nil { |
| 125 | return nil, fmt.Errorf("failed to create temporary matcher: %w", err) |
| 126 | } |
| Akron | 4de47a9 | 2025-06-27 11:58:11 +0200 | [diff] [blame] | 127 | if !tempMatcher.Match(node) { |
| 128 | continue |
| 129 | } |
| 130 | |
| Akron | 4de47a9 | 2025-06-27 11:58:11 +0200 | [diff] [blame] | 131 | replacementKey := patternCacheKey{ruleIndex: i, foundry: replacementFoundry, layer: replacementLayer, isReplacement: true} |
| 132 | processedReplacement, exists := patternCache[replacementKey] |
| 133 | if !exists { |
| Akron | 4de47a9 | 2025-06-27 11:58:11 +0200 | [diff] [blame] | 134 | processedReplacement = replacement.Clone() |
| Akron | 4de47a9 | 2025-06-27 11:58:11 +0200 | [diff] [blame] | 135 | if replacementFoundry != "" || replacementLayer != "" { |
| 136 | ast.ApplyFoundryAndLayerOverrides(processedReplacement, replacementFoundry, replacementLayer) |
| 137 | } |
| 138 | patternCache[replacementKey] = processedReplacement |
| 139 | } |
| 140 | |
| Akron | 8414ae5 | 2026-05-19 13:31:14 +0200 | [diff] [blame] | 141 | var beforeNode ast.Node |
| 142 | if opts.AddRewrites { |
| 143 | beforeNode = node.Clone() |
| 144 | } |
| 145 | |
| Akron | 4de47a9 | 2025-06-27 11:58:11 +0200 | [diff] [blame] | 146 | actualMatcher, err := matcher.NewMatcher(ast.Pattern{Root: processedPattern}, ast.Replacement{Root: processedReplacement}) |
| 147 | if err != nil { |
| 148 | return nil, fmt.Errorf("failed to create matcher: %w", err) |
| 149 | } |
| 150 | node = actualMatcher.Replace(node) |
| Akron | 8414ae5 | 2026-05-19 13:31:14 +0200 | [diff] [blame] | 151 | |
| 152 | if opts.AddRewrites { |
| 153 | recordRewrites(node, beforeNode) |
| 154 | } |
| Akron | 4de47a9 | 2025-06-27 11:58:11 +0200 | [diff] [blame] | 155 | } |
| 156 | |
| Akron | 4de47a9 | 2025-06-27 11:58:11 +0200 | [diff] [blame] | 157 | var result ast.Node |
| 158 | if isToken { |
| 159 | result = &ast.Token{Wrap: node} |
| 160 | } else { |
| 161 | result = node |
| 162 | } |
| 163 | |
| Akron | 4de47a9 | 2025-06-27 11:58:11 +0200 | [diff] [blame] | 164 | resultBytes, err := parser.SerializeToJSON(result) |
| 165 | if err != nil { |
| 166 | return nil, fmt.Errorf("failed to serialize AST to JSON: %w", err) |
| 167 | } |
| 168 | |
| Akron | 4de47a9 | 2025-06-27 11:58:11 +0200 | [diff] [blame] | 169 | var resultData any |
| 170 | if err := json.Unmarshal(resultBytes, &resultData); err != nil { |
| 171 | return nil, fmt.Errorf("failed to parse result JSON: %w", err) |
| 172 | } |
| 173 | |
| Akron | 8414ae5 | 2026-05-19 13:31:14 +0200 | [diff] [blame] | 174 | // Restore pre-existing rewrites. The round-trip through ast.Rewrite |
| 175 | // normalizes legacy field names (e.g. "source" -> "editor") so the |
| 176 | // output always uses the modern schema. |
| Akron | 4de47a9 | 2025-06-27 11:58:11 +0200 | [diff] [blame] | 177 | if oldRewrites != nil { |
| Akron | 4de47a9 | 2025-06-27 11:58:11 +0200 | [diff] [blame] | 178 | if rewritesList, ok := oldRewrites.([]any); ok { |
| 179 | processedRewrites := make([]any, len(rewritesList)) |
| 180 | for i, rewriteData := range rewritesList { |
| Akron | 4de47a9 | 2025-06-27 11:58:11 +0200 | [diff] [blame] | 181 | rewriteBytes, err := json.Marshal(rewriteData) |
| 182 | if err != nil { |
| 183 | return nil, fmt.Errorf("failed to marshal old rewrite %d: %w", i, err) |
| 184 | } |
| 185 | var rewrite ast.Rewrite |
| 186 | if err := json.Unmarshal(rewriteBytes, &rewrite); err != nil { |
| 187 | return nil, fmt.Errorf("failed to unmarshal old rewrite %d: %w", i, err) |
| 188 | } |
| Akron | 4de47a9 | 2025-06-27 11:58:11 +0200 | [diff] [blame] | 189 | transformedBytes, err := json.Marshal(&rewrite) |
| 190 | if err != nil { |
| 191 | return nil, fmt.Errorf("failed to marshal transformed rewrite %d: %w", i, err) |
| 192 | } |
| 193 | var transformedRewrite any |
| 194 | if err := json.Unmarshal(transformedBytes, &transformedRewrite); err != nil { |
| 195 | return nil, fmt.Errorf("failed to unmarshal transformed rewrite %d: %w", i, err) |
| 196 | } |
| 197 | processedRewrites[i] = transformedRewrite |
| 198 | } |
| 199 | if resultMap, ok := resultData.(map[string]any); ok { |
| 200 | resultMap["rewrites"] = processedRewrites |
| 201 | } |
| 202 | } else { |
| Akron | 4de47a9 | 2025-06-27 11:58:11 +0200 | [diff] [blame] | 203 | if resultMap, ok := resultData.(map[string]any); ok { |
| 204 | resultMap["rewrites"] = oldRewrites |
| 205 | } |
| 206 | } |
| 207 | } |
| 208 | |
| Akron | 4de47a9 | 2025-06-27 11:58:11 +0200 | [diff] [blame] | 209 | if hasQueryWrapper { |
| 210 | if wrapper, ok := jsonData.(map[string]any); ok { |
| 211 | wrapper["query"] = resultData |
| 212 | return wrapper, nil |
| 213 | } |
| 214 | } |
| 215 | |
| 216 | return resultData, nil |
| 217 | } |
| 218 | |
| Akron | 8414ae5 | 2026-05-19 13:31:14 +0200 | [diff] [blame] | 219 | // recordRewrites compares the new node against the before-snapshot and |
| 220 | // attaches rewrite entries to any changed nodes. It handles both simple |
| 221 | // nodes (Term, TermGroup) and container nodes (CatchallNode with operands). |
| 222 | func recordRewrites(newNode, beforeNode ast.Node) { |
| 223 | if ast.NodesEqual(newNode, beforeNode) { |
| 224 | return |
| 225 | } |
| 226 | |
| 227 | // For CatchallNodes with operands (e.g. token sequences), attach |
| 228 | // per-operand rewrites so each changed token gets its own annotation. |
| 229 | if newCatchall, ok := newNode.(*ast.CatchallNode); ok { |
| 230 | if oldCatchall, ok := beforeNode.(*ast.CatchallNode); ok && len(newCatchall.Operands) > 0 { |
| 231 | for i, newOp := range newCatchall.Operands { |
| 232 | if i >= len(oldCatchall.Operands) { |
| 233 | break |
| 234 | } |
| 235 | oldOp := oldCatchall.Operands[i] |
| 236 | recordRewritesForOperand(newOp, oldOp) |
| 237 | } |
| 238 | return |
| 239 | } |
| 240 | } |
| 241 | |
| 242 | addRewriteToNode(newNode, beforeNode) |
| 243 | } |
| 244 | |
| 245 | // recordRewritesForOperand handles rewrite recording for a single operand, |
| 246 | // unwrapping Token nodes so the rewrite attaches to the inner term/termGroup |
| 247 | // rather than the token wrapper. |
| 248 | func recordRewritesForOperand(newOp, oldOp ast.Node) { |
| 249 | if ast.NodesEqual(newOp, oldOp) { |
| 250 | return |
| 251 | } |
| 252 | |
| 253 | newInner := newOp |
| 254 | oldInner := oldOp |
| 255 | if tok, ok := newOp.(*ast.Token); ok { |
| 256 | newInner = tok.Wrap |
| 257 | } |
| 258 | if tok, ok := oldOp.(*ast.Token); ok { |
| 259 | oldInner = tok.Wrap |
| 260 | } |
| 261 | |
| 262 | if newInner == nil || ast.NodesEqual(newInner, oldInner) { |
| 263 | return |
| 264 | } |
| 265 | |
| 266 | addRewriteToNode(newInner, oldInner) |
| 267 | } |
| 268 | |
| Akron | 958fc47 | 2026-05-19 13:58:52 +0200 | [diff] [blame^] | 269 | // addRewriteToNode creates and attaches rewrite entries to a node, |
| Akron | 8414ae5 | 2026-05-19 13:31:14 +0200 | [diff] [blame] | 270 | // recording what the node looked like before the change. |
| 271 | func addRewriteToNode(newNode, originalNode ast.Node) { |
| Akron | 958fc47 | 2026-05-19 13:58:52 +0200 | [diff] [blame^] | 272 | for _, rw := range buildRewrites(originalNode, newNode) { |
| 273 | ast.AppendRewrite(newNode, rw) |
| 274 | } |
| Akron | 8414ae5 | 2026-05-19 13:31:14 +0200 | [diff] [blame] | 275 | } |
| 276 | |
| Akron | 958fc47 | 2026-05-19 13:58:52 +0200 | [diff] [blame^] | 277 | // buildRewrites creates Rewrite entries describing what changed between |
| 278 | // originalNode and newNode. For term-level changes it emits one scoped |
| 279 | // rewrite per changed field so the transformation is fully reversible. |
| 280 | // For structural changes it stores the full original as an object. |
| 281 | func buildRewrites(originalNode, newNode ast.Node) []ast.Rewrite { |
| Akron | 2f93c58 | 2026-02-19 16:49:13 +0100 | [diff] [blame] | 282 | if term, ok := originalNode.(*ast.Term); ok && ast.IsTermNode(newNode) && originalNode.Type() == newNode.Type() { |
| 283 | newTerm := newNode.(*ast.Term) |
| Akron | 958fc47 | 2026-05-19 13:58:52 +0200 | [diff] [blame^] | 284 | var rewrites []ast.Rewrite |
| 285 | |
| Akron | 2f93c58 | 2026-02-19 16:49:13 +0100 | [diff] [blame] | 286 | if term.Foundry != newTerm.Foundry { |
| Akron | 958fc47 | 2026-05-19 13:58:52 +0200 | [diff] [blame^] | 287 | rw := ast.Rewrite{Editor: RewriteEditor, Scope: "foundry"} |
| 288 | if term.Foundry != "" { |
| 289 | rw.Original = term.Foundry |
| 290 | } |
| 291 | rewrites = append(rewrites, rw) |
| Akron | 2f93c58 | 2026-02-19 16:49:13 +0100 | [diff] [blame] | 292 | } |
| 293 | if term.Layer != newTerm.Layer { |
| Akron | 958fc47 | 2026-05-19 13:58:52 +0200 | [diff] [blame^] | 294 | rw := ast.Rewrite{Editor: RewriteEditor, Scope: "layer"} |
| 295 | if term.Layer != "" { |
| 296 | rw.Original = term.Layer |
| 297 | } |
| 298 | rewrites = append(rewrites, rw) |
| Akron | 2f93c58 | 2026-02-19 16:49:13 +0100 | [diff] [blame] | 299 | } |
| 300 | if term.Key != newTerm.Key { |
| Akron | 958fc47 | 2026-05-19 13:58:52 +0200 | [diff] [blame^] | 301 | rw := ast.Rewrite{Editor: RewriteEditor, Scope: "key"} |
| 302 | if term.Key != "" { |
| 303 | rw.Original = term.Key |
| 304 | } |
| 305 | rewrites = append(rewrites, rw) |
| Akron | 2f93c58 | 2026-02-19 16:49:13 +0100 | [diff] [blame] | 306 | } |
| 307 | if term.Value != newTerm.Value { |
| Akron | 958fc47 | 2026-05-19 13:58:52 +0200 | [diff] [blame^] | 308 | rw := ast.Rewrite{Editor: RewriteEditor, Scope: "value"} |
| 309 | if term.Value != "" { |
| 310 | rw.Original = term.Value |
| 311 | } |
| 312 | rewrites = append(rewrites, rw) |
| 313 | } |
| 314 | |
| 315 | if len(rewrites) > 0 { |
| 316 | return rewrites |
| Akron | 2f93c58 | 2026-02-19 16:49:13 +0100 | [diff] [blame] | 317 | } |
| 318 | } |
| 319 | |
| Akron | 8414ae5 | 2026-05-19 13:31:14 +0200 | [diff] [blame] | 320 | // Structural change: serialize the original as the rewrite value |
| Akron | 2f93c58 | 2026-02-19 16:49:13 +0100 | [diff] [blame] | 321 | originalBytes, err := parser.SerializeToJSON(originalNode) |
| 322 | if err != nil { |
| Akron | 958fc47 | 2026-05-19 13:58:52 +0200 | [diff] [blame^] | 323 | return []ast.Rewrite{{Editor: RewriteEditor}} |
| Akron | 2f93c58 | 2026-02-19 16:49:13 +0100 | [diff] [blame] | 324 | } |
| 325 | var originalJSON any |
| 326 | if err := json.Unmarshal(originalBytes, &originalJSON); err != nil { |
| Akron | 958fc47 | 2026-05-19 13:58:52 +0200 | [diff] [blame^] | 327 | return []ast.Rewrite{{Editor: RewriteEditor}} |
| Akron | 2f93c58 | 2026-02-19 16:49:13 +0100 | [diff] [blame] | 328 | } |
| Akron | 958fc47 | 2026-05-19 13:58:52 +0200 | [diff] [blame^] | 329 | return []ast.Rewrite{{Editor: RewriteEditor, Original: originalJSON}} |
| Akron | 2f93c58 | 2026-02-19 16:49:13 +0100 | [diff] [blame] | 330 | } |
| 331 | |
// isValidQueryObject reports whether data is a JSON object (decoded as
// map[string]any) that has an "@type" key. Only the key's presence matters;
// its value is not inspected.
func isValidQueryObject(data any) bool {
	if object, isObject := data.(map[string]any); isObject {
		_, hasType := object["@type"]
		return hasType
	}
	return false
}