| Akron | a3675e9 | 2025-06-26 17:46:59 +0200 | [diff] [blame] | 1 | package mapper |
| 2 | |
| 3 | import ( |
| 4 | "fmt" |
| Akron | beee505 | 2026-05-20 09:39:45 +0200 | [diff] [blame] | 5 | "html" |
| Akron | a8b9fbc | 2026-03-05 16:43:05 +0100 | [diff] [blame] | 6 | "maps" |
| Akron | a3675e9 | 2025-06-26 17:46:59 +0200 | [diff] [blame] | 7 | "strings" |
| 8 | |
| Akron | 2ef703c | 2025-07-03 15:57:42 +0200 | [diff] [blame] | 9 | "github.com/KorAP/Koral-Mapper/ast" |
| 10 | "github.com/KorAP/Koral-Mapper/matcher" |
| 11 | "github.com/KorAP/Koral-Mapper/parser" |
| Akron | 9663af9 | 2026-02-20 13:45:08 +0100 | [diff] [blame] | 12 | "github.com/orisano/gosax" |
| Akron | a1337ef | 2025-07-01 12:28:03 +0200 | [diff] [blame] | 13 | "github.com/rs/zerolog/log" |
| Akron | a3675e9 | 2025-06-26 17:46:59 +0200 | [diff] [blame] | 14 | ) |
| 15 | |
| 16 | // ApplyResponseMappings applies the specified mapping rules to a JSON object |
| 17 | func (m *Mapper) ApplyResponseMappings(mappingID string, opts MappingOptions, jsonData any) (any, error) { |
| 18 | // Validate mapping ID |
| 19 | if _, exists := m.mappingLists[mappingID]; !exists { |
| 20 | return nil, fmt.Errorf("mapping list with ID %s not found", mappingID) |
| 21 | } |
| 22 | |
| Akron | 422cd25 | 2026-05-19 16:31:19 +0200 | [diff] [blame] | 23 | if err := m.validateEffectiveOptions(mappingID, opts); err != nil { |
| 24 | return nil, err |
| 25 | } |
| 26 | |
| Akron | 2f93c58 | 2026-02-19 16:49:13 +0100 | [diff] [blame] | 27 | if m.mappingLists[mappingID].IsCorpus() { |
| 28 | return m.applyCorpusResponseMappings(mappingID, opts, jsonData) |
| 29 | } |
| 30 | |
| Akron | a3675e9 | 2025-06-26 17:46:59 +0200 | [diff] [blame] | 31 | // Get the parsed rules |
| Akron | 2f93c58 | 2026-02-19 16:49:13 +0100 | [diff] [blame] | 32 | rules := m.parsedQueryRules[mappingID] |
| Akron | a3675e9 | 2025-06-26 17:46:59 +0200 | [diff] [blame] | 33 | |
| 34 | // Check if we have a snippet to process |
| 35 | jsonMap, ok := jsonData.(map[string]any) |
| 36 | if !ok { |
| 37 | return jsonData, nil |
| 38 | } |
| 39 | |
| 40 | snippetValue, exists := jsonMap["snippet"] |
| 41 | if !exists { |
| 42 | return jsonData, nil |
| 43 | } |
| 44 | |
| 45 | snippet, ok := snippetValue.(string) |
| 46 | if !ok { |
| 47 | return jsonData, nil |
| 48 | } |
| 49 | |
| 50 | // Process the snippet with each rule |
| 51 | processedSnippet := snippet |
| Akron | 497cfe8 | 2025-07-03 13:26:54 +0200 | [diff] [blame] | 52 | for ruleIndex, rule := range rules { |
| Akron | a3675e9 | 2025-06-26 17:46:59 +0200 | [diff] [blame] | 53 | // Create pattern and replacement based on direction |
| 54 | var pattern, replacement ast.Node |
| 55 | if opts.Direction { // true means AtoB |
| 56 | pattern = rule.Upper |
| 57 | replacement = rule.Lower |
| 58 | } else { |
| 59 | pattern = rule.Lower |
| 60 | replacement = rule.Upper |
| 61 | } |
| 62 | |
| 63 | // Extract the inner nodes from the pattern and replacement tokens |
| 64 | if token, ok := pattern.(*ast.Token); ok { |
| 65 | pattern = token.Wrap |
| 66 | } |
| 67 | if token, ok := replacement.(*ast.Token); ok { |
| 68 | replacement = token.Wrap |
| 69 | } |
| 70 | |
| Akron | 497cfe8 | 2025-07-03 13:26:54 +0200 | [diff] [blame] | 71 | // Apply foundry and layer overrides with proper precedence |
| 72 | mappingList := m.mappingLists[mappingID] |
| 73 | |
| 74 | // Determine foundry and layer values based on direction |
| Akron | a3675e9 | 2025-06-26 17:46:59 +0200 | [diff] [blame] | 75 | var patternFoundry, patternLayer, replacementFoundry, replacementLayer string |
| Akron | 497cfe8 | 2025-07-03 13:26:54 +0200 | [diff] [blame] | 76 | if opts.Direction { // AtoB |
| Akron | a3675e9 | 2025-06-26 17:46:59 +0200 | [diff] [blame] | 77 | patternFoundry, patternLayer = opts.FoundryA, opts.LayerA |
| 78 | replacementFoundry, replacementLayer = opts.FoundryB, opts.LayerB |
| Akron | 497cfe8 | 2025-07-03 13:26:54 +0200 | [diff] [blame] | 79 | // Apply mapping list defaults if not specified |
| 80 | if replacementFoundry == "" { |
| 81 | replacementFoundry = mappingList.FoundryB |
| 82 | } |
| 83 | if replacementLayer == "" { |
| 84 | replacementLayer = mappingList.LayerB |
| 85 | } |
| 86 | } else { // BtoA |
| Akron | a3675e9 | 2025-06-26 17:46:59 +0200 | [diff] [blame] | 87 | patternFoundry, patternLayer = opts.FoundryB, opts.LayerB |
| 88 | replacementFoundry, replacementLayer = opts.FoundryA, opts.LayerA |
| Akron | 497cfe8 | 2025-07-03 13:26:54 +0200 | [diff] [blame] | 89 | // Apply mapping list defaults if not specified |
| 90 | if replacementFoundry == "" { |
| Akron | a3675e9 | 2025-06-26 17:46:59 +0200 | [diff] [blame] | 91 | replacementFoundry = mappingList.FoundryA |
| Akron | 4de47a9 | 2025-06-27 11:58:11 +0200 | [diff] [blame] | 92 | } |
| Akron | 497cfe8 | 2025-07-03 13:26:54 +0200 | [diff] [blame] | 93 | if replacementLayer == "" { |
| Akron | a3675e9 | 2025-06-26 17:46:59 +0200 | [diff] [blame] | 94 | replacementLayer = mappingList.LayerA |
| 95 | } |
| 96 | } |
| 97 | |
| Akron | 497cfe8 | 2025-07-03 13:26:54 +0200 | [diff] [blame] | 98 | // Clone pattern and apply foundry and layer overrides |
| Akron | a3675e9 | 2025-06-26 17:46:59 +0200 | [diff] [blame] | 99 | processedPattern := pattern.Clone() |
| 100 | if patternFoundry != "" || patternLayer != "" { |
| 101 | ast.ApplyFoundryAndLayerOverrides(processedPattern, patternFoundry, patternLayer) |
| 102 | } |
| 103 | |
| Akron | a3675e9 | 2025-06-26 17:46:59 +0200 | [diff] [blame] | 104 | // Create snippet matcher for this rule |
| 105 | snippetMatcher, err := matcher.NewSnippetMatcher( |
| 106 | ast.Pattern{Root: processedPattern}, |
| 107 | ast.Replacement{Root: replacement}, |
| 108 | ) |
| 109 | if err != nil { |
| 110 | continue // Skip this rule if we can't create a matcher |
| 111 | } |
| 112 | |
| 113 | // Find matching tokens in the snippet |
| 114 | matchingTokens, err := snippetMatcher.FindMatchingTokens(processedSnippet) |
| 115 | if err != nil { |
| 116 | continue // Skip this rule if parsing fails |
| 117 | } |
| 118 | |
| 119 | if len(matchingTokens) == 0 { |
| 120 | continue // No matches, try next rule |
| 121 | } |
| 122 | |
| Akron | 497cfe8 | 2025-07-03 13:26:54 +0200 | [diff] [blame] | 123 | // Apply RestrictToObligatory with layer precedence logic |
| 124 | restrictedReplacement := m.applyReplacementWithLayerPrecedence( |
| 125 | replacement, replacementFoundry, replacementLayer, |
| 126 | mappingID, ruleIndex, bool(opts.Direction)) |
| Akron | a3675e9 | 2025-06-26 17:46:59 +0200 | [diff] [blame] | 127 | if restrictedReplacement == nil { |
| 128 | continue // Nothing obligatory to add |
| 129 | } |
| 130 | |
| 131 | // Generate annotation strings from the restricted replacement |
| 132 | annotationStrings, err := m.generateAnnotationStrings(restrictedReplacement) |
| 133 | if err != nil { |
| 134 | continue // Skip if we can't generate annotations |
| 135 | } |
| 136 | |
| 137 | if len(annotationStrings) == 0 { |
| 138 | continue // Nothing to add |
| 139 | } |
| 140 | |
| 141 | // Apply annotations to matching tokens in the snippet |
| 142 | processedSnippet, err = m.addAnnotationsToSnippet(processedSnippet, matchingTokens, annotationStrings) |
| 143 | if err != nil { |
| 144 | continue // Skip if we can't apply annotations |
| 145 | } |
| 146 | } |
| 147 | |
| Akron | a1337ef | 2025-07-01 12:28:03 +0200 | [diff] [blame] | 148 | log.Debug().Str("snippet", processedSnippet).Msg("Processed snippet") |
| 149 | |
| Akron | a3675e9 | 2025-06-26 17:46:59 +0200 | [diff] [blame] | 150 | // Create a copy of the input data and update the snippet |
| 151 | result := make(map[string]any) |
| Akron | a8b9fbc | 2026-03-05 16:43:05 +0100 | [diff] [blame] | 152 | maps.Copy(result, jsonMap) |
| Akron | a3675e9 | 2025-06-26 17:46:59 +0200 | [diff] [blame] | 153 | result["snippet"] = processedSnippet |
| 154 | |
| 155 | return result, nil |
| 156 | } |
| 157 | |
| 158 | // generateAnnotationStrings converts a replacement AST node into annotation strings |
| 159 | func (m *Mapper) generateAnnotationStrings(node ast.Node) ([]string, error) { |
| 160 | if node == nil { |
| 161 | return nil, nil |
| 162 | } |
| 163 | |
| 164 | switch n := node.(type) { |
| 165 | case *ast.Term: |
| 166 | // Create annotation string in format "foundry/layer:key" or "foundry/layer:key:value" |
| 167 | annotation := n.Foundry + "/" + n.Layer + ":" + n.Key |
| 168 | if n.Value != "" { |
| 169 | annotation += ":" + n.Value |
| 170 | } |
| 171 | return []string{annotation}, nil |
| 172 | |
| 173 | case *ast.TermGroup: |
| 174 | if n.Relation == ast.AndRelation { |
| 175 | // For AND groups, collect all annotations |
| 176 | var allAnnotations []string |
| 177 | for _, operand := range n.Operands { |
| 178 | annotations, err := m.generateAnnotationStrings(operand) |
| 179 | if err != nil { |
| 180 | return nil, err |
| 181 | } |
| 182 | allAnnotations = append(allAnnotations, annotations...) |
| 183 | } |
| 184 | return allAnnotations, nil |
| 185 | } else { |
| 186 | // For OR groups (should not happen with RestrictToObligatory, but handle gracefully) |
| 187 | return nil, nil |
| 188 | } |
| 189 | |
| 190 | case *ast.Token: |
| 191 | // Handle wrapped tokens |
| 192 | if n.Wrap != nil { |
| 193 | return m.generateAnnotationStrings(n.Wrap) |
| 194 | } |
| 195 | return nil, nil |
| 196 | |
| 197 | default: |
| 198 | return nil, nil |
| 199 | } |
| 200 | } |
| 201 | |
| 202 | // addAnnotationsToSnippet adds new annotations to matching tokens in the snippet |
| Akron | 9663af9 | 2026-02-20 13:45:08 +0100 | [diff] [blame] | 203 | // using SAX-based parsing for structural identification of text nodes. |
| Akron | a3675e9 | 2025-06-26 17:46:59 +0200 | [diff] [blame] | 204 | func (m *Mapper) addAnnotationsToSnippet(snippet string, matchingTokens []matcher.TokenSpan, annotationStrings []string) (string, error) { |
| 205 | if len(matchingTokens) == 0 || len(annotationStrings) == 0 { |
| 206 | return snippet, nil |
| 207 | } |
| 208 | |
| Akron | 9663af9 | 2026-02-20 13:45:08 +0100 | [diff] [blame] | 209 | tokenByStartPos := make(map[int]matcher.TokenSpan) |
| 210 | for _, tok := range matchingTokens { |
| 211 | tokenByStartPos[tok.StartPos] = tok |
| 212 | } |
| Akron | a3675e9 | 2025-06-26 17:46:59 +0200 | [diff] [blame] | 213 | |
| Akron | 9663af9 | 2026-02-20 13:45:08 +0100 | [diff] [blame] | 214 | reader := strings.NewReader(snippet) |
| 215 | r := gosax.NewReader(reader) |
| Akron | a3675e9 | 2025-06-26 17:46:59 +0200 | [diff] [blame] | 216 | |
| Akron | 9663af9 | 2026-02-20 13:45:08 +0100 | [diff] [blame] | 217 | var result strings.Builder |
| 218 | result.Grow(len(snippet) + len(matchingTokens)*100) |
| 219 | |
| 220 | var textPos int |
| 221 | |
| 222 | for { |
| 223 | e, err := r.Event() |
| 224 | if err != nil { |
| 225 | return "", fmt.Errorf("failed to parse snippet for annotation: %w", err) |
| 226 | } |
| 227 | if e.Type() == gosax.EventEOF { |
| 228 | break |
| 229 | } |
| 230 | |
| 231 | switch e.Type() { |
| 232 | case gosax.EventStart: |
| 233 | result.Write(e.Bytes) |
| 234 | |
| 235 | case gosax.EventEnd: |
| 236 | result.Write(e.Bytes) |
| 237 | |
| 238 | case gosax.EventText: |
| 239 | charData, err := gosax.CharData(e.Bytes) |
| 240 | if err != nil { |
| 241 | result.Write(e.Bytes) |
| 242 | break |
| Akron | a3675e9 | 2025-06-26 17:46:59 +0200 | [diff] [blame] | 243 | } |
| Akron | a3675e9 | 2025-06-26 17:46:59 +0200 | [diff] [blame] | 244 | |
| Akron | 9663af9 | 2026-02-20 13:45:08 +0100 | [diff] [blame] | 245 | text := string(charData) |
| 246 | trimmed := strings.TrimSpace(text) |
| Akron | a3675e9 | 2025-06-26 17:46:59 +0200 | [diff] [blame] | 247 | |
| Akron | 9663af9 | 2026-02-20 13:45:08 +0100 | [diff] [blame] | 248 | if token, ok := tokenByStartPos[textPos]; ok && trimmed != "" && trimmed == token.Text { |
| Akron | a8b9fbc | 2026-03-05 16:43:05 +0100 | [diff] [blame] | 249 | before, after, _ := strings.Cut(text, trimmed) |
| 250 | leadingWS := before |
| 251 | trailingWS := after |
| Akron | 9663af9 | 2026-02-20 13:45:08 +0100 | [diff] [blame] | 252 | |
| 253 | result.WriteString(leadingWS) |
| 254 | |
| 255 | annotated := escapeXMLText(trimmed) |
| Akron | a3675e9 | 2025-06-26 17:46:59 +0200 | [diff] [blame] | 256 | for i := len(annotationStrings) - 1; i >= 0; i-- { |
| Akron | beee505 | 2026-05-20 09:39:45 +0200 | [diff] [blame] | 257 | annotated = fmt.Sprintf(`<span title="%s" class="notinindex">%s</span>`, html.EscapeString(annotationStrings[i]), annotated) |
| Akron | a3675e9 | 2025-06-26 17:46:59 +0200 | [diff] [blame] | 258 | } |
| Akron | 9663af9 | 2026-02-20 13:45:08 +0100 | [diff] [blame] | 259 | result.WriteString(annotated) |
| 260 | result.WriteString(trailingWS) |
| 261 | } else { |
| 262 | result.Write(e.Bytes) |
| Akron | a3675e9 | 2025-06-26 17:46:59 +0200 | [diff] [blame] | 263 | } |
| 264 | |
| Akron | 9663af9 | 2026-02-20 13:45:08 +0100 | [diff] [blame] | 265 | textPos += len(text) |
| 266 | |
| 267 | default: |
| 268 | result.Write(e.Bytes) |
| Akron | a3675e9 | 2025-06-26 17:46:59 +0200 | [diff] [blame] | 269 | } |
| 270 | } |
| 271 | |
| Akron | 9663af9 | 2026-02-20 13:45:08 +0100 | [diff] [blame] | 272 | return result.String(), nil |
| 273 | } |
| 274 | |
// xmlTextEscaper replaces the characters that are significant in XML
// character data with their predefined entity references in a single pass.
var xmlTextEscaper = strings.NewReplacer(
	"&", "&amp;",
	"<", "&lt;",
	">", "&gt;",
)

// escapeXMLText escapes s for use as XML/HTML character data: "&" becomes
// "&amp;", "<" becomes "&lt;" and ">" becomes "&gt;". All other characters,
// including quotes, are returned unchanged, so the result is only suitable
// for element content and not for attribute values.
func escapeXMLText(s string) string {
	return xmlTextEscaper.Replace(s)
}
| Akron | 497cfe8 | 2025-07-03 13:26:54 +0200 | [diff] [blame] | 281 | |
| 282 | // applyReplacementWithLayerPrecedence applies RestrictToObligatory with proper layer precedence |
| 283 | func (m *Mapper) applyReplacementWithLayerPrecedence( |
| 284 | replacement ast.Node, foundry, layerOverride string, |
| 285 | mappingID string, ruleIndex int, direction bool) ast.Node { |
| 286 | |
| 287 | // First, apply RestrictToObligatory without layer override to preserve explicit layers |
| 288 | restricted := ast.RestrictToObligatory(replacement, foundry, "") |
| 289 | if restricted == nil { |
| 290 | return nil |
| 291 | } |
| 292 | |
| 293 | // If no layer override is specified, we're done |
| 294 | if layerOverride == "" { |
| 295 | return restricted |
| 296 | } |
| 297 | |
| 298 | // Apply layer override only to terms that didn't have explicit layers in the original rule |
| 299 | mappingList := m.mappingLists[mappingID] |
| 300 | if ruleIndex < len(mappingList.Mappings) { |
| 301 | originalRule := string(mappingList.Mappings[ruleIndex]) |
| 302 | m.applySelectiveLayerOverrides(restricted, layerOverride, originalRule, direction) |
| 303 | } |
| 304 | |
| 305 | return restricted |
| 306 | } |
| 307 | |
| 308 | // applySelectiveLayerOverrides applies layer overrides only to terms without explicit layers |
| 309 | func (m *Mapper) applySelectiveLayerOverrides(node ast.Node, layerOverride, originalRule string, direction bool) { |
| 310 | if node == nil { |
| 311 | return |
| 312 | } |
| 313 | |
| 314 | // Parse the original rule without defaults to detect explicit layers |
| 315 | explicitTerms := m.getExplicitTerms(originalRule, direction) |
| 316 | |
| 317 | // Apply overrides only to terms that weren't explicit in the original rule |
| 318 | termIndex := 0 |
| 319 | m.applyLayerOverrideToImplicitTerms(node, layerOverride, explicitTerms, &termIndex) |
| 320 | } |
| 321 | |
| 322 | // getExplicitTerms parses the original rule without defaults to identify terms with explicit layers |
| 323 | func (m *Mapper) getExplicitTerms(originalRule string, direction bool) map[int]bool { |
| 324 | explicitTerms := make(map[int]bool) |
| 325 | |
| 326 | // Parse without defaults to see what was explicitly specified |
| 327 | parser, err := parser.NewGrammarParser("", "") |
| 328 | if err != nil { |
| 329 | return explicitTerms |
| 330 | } |
| 331 | |
| 332 | result, err := parser.ParseMapping(originalRule) |
| 333 | if err != nil { |
| 334 | return explicitTerms |
| 335 | } |
| 336 | |
| 337 | // Get the replacement side based on direction |
| 338 | var replacement ast.Node |
| 339 | if direction { // AtoB |
| 340 | replacement = result.Lower.Wrap |
| 341 | } else { // BtoA |
| 342 | replacement = result.Upper.Wrap |
| 343 | } |
| 344 | |
| 345 | // Extract terms and check which ones have explicit layers |
| 346 | termIndex := 0 |
| 347 | m.markExplicitTerms(replacement, explicitTerms, &termIndex) |
| 348 | return explicitTerms |
| 349 | } |
| 350 | |
| 351 | // markExplicitTerms recursively marks terms that have explicit layers |
| 352 | func (m *Mapper) markExplicitTerms(node ast.Node, explicitTerms map[int]bool, termIndex *int) { |
| 353 | if node == nil { |
| 354 | return |
| 355 | } |
| 356 | |
| 357 | switch n := node.(type) { |
| 358 | case *ast.Term: |
| 359 | // A term has an explicit layer if it was specified in the original rule |
| 360 | if n.Layer != "" { |
| 361 | explicitTerms[*termIndex] = true |
| 362 | } |
| 363 | *termIndex++ |
| 364 | |
| 365 | case *ast.TermGroup: |
| 366 | for _, operand := range n.Operands { |
| 367 | m.markExplicitTerms(operand, explicitTerms, termIndex) |
| 368 | } |
| 369 | |
| 370 | case *ast.Token: |
| 371 | if n.Wrap != nil { |
| 372 | m.markExplicitTerms(n.Wrap, explicitTerms, termIndex) |
| 373 | } |
| 374 | } |
| 375 | } |
| 376 | |
| 377 | // applyLayerOverrideToImplicitTerms applies layer override only to terms not marked as explicit |
| 378 | func (m *Mapper) applyLayerOverrideToImplicitTerms(node ast.Node, layerOverride string, explicitTerms map[int]bool, termIndex *int) { |
| 379 | if node == nil { |
| 380 | return |
| 381 | } |
| 382 | |
| 383 | switch n := node.(type) { |
| 384 | case *ast.Term: |
| 385 | // Apply override only if this term wasn't explicit in the original rule |
| 386 | if !explicitTerms[*termIndex] && n.Layer != "" { |
| 387 | n.Layer = layerOverride |
| 388 | } |
| 389 | *termIndex++ |
| 390 | |
| 391 | case *ast.TermGroup: |
| 392 | for _, operand := range n.Operands { |
| 393 | m.applyLayerOverrideToImplicitTerms(operand, layerOverride, explicitTerms, termIndex) |
| 394 | } |
| 395 | |
| 396 | case *ast.Token: |
| 397 | if n.Wrap != nil { |
| 398 | m.applyLayerOverrideToImplicitTerms(n.Wrap, layerOverride, explicitTerms, termIndex) |
| 399 | } |
| 400 | } |
| 401 | } |