Fix corpus response parsing for fields nested under "document"
Change-Id: Ib305c6430e997f5afdf07310844fab6c7f6443ac
diff --git a/mapper/benchmark_test.go b/mapper/benchmark_test.go
index fcf28a6..289cd8c 100644
--- a/mapper/benchmark_test.go
+++ b/mapper/benchmark_test.go
@@ -181,7 +181,7 @@
func BenchmarkApplyQueryMappingsWorstCase(b *testing.B) {
// Create a mapper with many rules
manyRules := make([]config.MappingRule, 100)
- for i := 0; i < 100; i++ {
+ for i := range 100 {
ruleChar := string(rune('A' + i%26))
manyRules[i] = config.MappingRule("[UNUSED" + ruleChar + "] <> [TARGET" + ruleChar + "]")
}
diff --git a/mapper/corpus.go b/mapper/corpus.go
index 9aad21e..f3f2b0c 100644
--- a/mapper/corpus.go
+++ b/mapper/corpus.go
@@ -1,7 +1,9 @@
package mapper
import (
+ "maps"
"regexp"
+ "slices"
"github.com/KorAP/Koral-Mapper/parser"
)
@@ -390,12 +392,7 @@
return jsonData, nil
}
- fieldsRaw, exists := jsonMap["fields"]
- if !exists {
- return jsonData, nil
- }
-
- fields, ok := fieldsRaw.([]any)
+ fieldsInDocument, fields, ok := extractResponseFieldsContainer(jsonMap)
if !ok {
return jsonData, nil
}
@@ -421,11 +418,52 @@
newFields = append(newFields, mapped...)
}
+ fieldValues := collectResponseFieldValues(fields)
+ newFields = append(newFields, m.matchGroupPatternsAndCollect(fieldValues, rules, opts)...)
+
result := shallowCopyMap(jsonMap)
- result["fields"] = newFields
+ if !fieldsInDocument {
+ result["fields"] = newFields
+ return result, nil
+ }
+
+ if document, ok := jsonMap["document"].(map[string]any); ok {
+ documentCopy := shallowCopyMap(document)
+ documentCopy["fields"] = newFields
+ result["document"] = documentCopy
+ }
+
return result, nil
}
+// extractResponseFieldsContainer locates the array of response fields inside
+// a decoded JSON object.
+//
+// A top-level "fields" entry is used whenever it holds an array; otherwise the
+// "fields" array of a "document" object is used. The results are, in order:
+// whether the array was taken from "document", the array itself, and whether
+// any array was found. When none is found the result is (false, nil, false).
+func extractResponseFieldsContainer(jsonMap map[string]any) (bool, []any, bool) {
+	// A missing key yields a nil interface, so the type assertion alone covers
+	// both "absent" and "present but not an array".
+	if topLevel, isArray := jsonMap["fields"].([]any); isArray {
+		return false, topLevel, true
+	}
+
+	doc, isObject := jsonMap["document"].(map[string]any)
+	if !isObject {
+		return false, nil, false
+	}
+
+	if nested, isArray := doc["fields"].([]any); isArray {
+		return true, nested, true
+	}
+	return false, nil, false
+}
+
// matchFieldAndCollect matches a field's key/value against rules and returns mapped entries.
// For array values, each element is matched individually.
func (m *Mapper) matchFieldAndCollect(key string, value any, rules []*parser.CorpusMappingResult, opts MappingOptions) []any {
@@ -474,6 +512,118 @@
return results
}
+// matchGroupPatternsAndCollect evaluates group-based rule patterns against the
+// full set of response field values rather than one field at a time (e.g. AND
+// combinations across multi-valued textClass fields).
+//
+// For each rule the pattern side is Upper and the replacement side is Lower
+// when opts.Direction is AtoB; otherwise the sides are swapped. Rules whose
+// pattern does not need aggregate matching are ignored here. The result is
+// the concatenation, in rule order, of the replacement fields of every rule
+// whose pattern matched.
+func (m *Mapper) matchGroupPatternsAndCollect(values map[string][]string, rules []*parser.CorpusMappingResult, opts MappingOptions) []any {
+	upperIsPattern := opts.Direction == AtoB
+
+	var collected []any
+	for _, r := range rules {
+		var source, target parser.CorpusNode = r.Lower, r.Upper
+		if upperIsPattern {
+			source, target = r.Upper, r.Lower
+		}
+
+		if patternNeedsAggregateMatching(source) && matchCorpusPatternAgainstValues(source, values) {
+			collected = append(collected, collectReplacementFields(target)...)
+		}
+	}
+
+	return collected
+}
+
+// collectResponseFieldValues indexes the string values of response fields by
+// field key.
+//
+// Entries that are not JSON objects, or whose "key" is missing, not a string,
+// or empty, are skipped. A string "value" contributes one value; an array
+// "value" contributes each of its string elements and ignores the rest. Any
+// other value type contributes nothing. Values of fields sharing a key are
+// appended in input order. The returned map is never nil.
+func collectResponseFieldValues(fields []any) map[string][]string {
+	byKey := map[string][]string{}
+
+	for _, entry := range fields {
+		obj, isObject := entry.(map[string]any)
+		if !isObject {
+			continue
+		}
+
+		name, _ := obj["key"].(string)
+		if name == "" {
+			continue
+		}
+
+		if single, isString := obj["value"].(string); isString {
+			byKey[name] = append(byKey[name], single)
+			continue
+		}
+
+		list, isList := obj["value"].([]any)
+		if !isList {
+			continue
+		}
+		for _, item := range list {
+			if text, isString := item.(string); isString {
+				byKey[name] = append(byKey[name], text)
+			}
+		}
+	}
+
+	return byKey
+}
+
+// matchCorpusPatternAgainstValues reports whether pattern is satisfied by the
+// collected response values (field key -> string values).
+//
+// A field pattern is satisfied when matchCorpusField accepts at least one
+// key/value pair: only the values stored under its key are tried, or every
+// pair of every key when the pattern's key is empty. A group with operation
+// "or" is satisfied when any operand is; a group with any other operation is
+// satisfied only when all operands are. Any other node type never matches.
+//
+// NOTE(review): a non-"or" group without operands is satisfied vacuously;
+// confirm the parser never produces such a group.
+func matchCorpusPatternAgainstValues(pattern parser.CorpusNode, values map[string][]string) bool {
+	switch node := pattern.(type) {
+	case *parser.CorpusField:
+		accepts := func(key, value string) bool {
+			return matchCorpusField(node, map[string]any{"key": key, "value": value})
+		}
+
+		if node.Key != "" {
+			return slices.ContainsFunc(values[node.Key], func(candidate string) bool {
+				return accepts(node.Key, candidate)
+			})
+		}
+
+		// Keyless pattern: any value under any key may satisfy it.
+		for key, candidates := range values {
+			if slices.ContainsFunc(candidates, func(candidate string) bool {
+				return accepts(key, candidate)
+			}) {
+				return true
+			}
+		}
+		return false
+
+	case *parser.CorpusGroup:
+		satisfied := func(operand parser.CorpusNode) bool {
+			return matchCorpusPatternAgainstValues(operand, values)
+		}
+
+		if node.Operation == "or" {
+			return slices.ContainsFunc(node.Operands, satisfied)
+		}
+
+		// Conjunction: fail as soon as one operand is not satisfied.
+		return !slices.ContainsFunc(node.Operands, func(operand parser.CorpusNode) bool {
+			return !satisfied(operand)
+		})
+	}
+
+	return false
+}
+
+// patternNeedsAggregateMatching reports whether pattern is, or contains at
+// any depth, a group whose operation is "and". Field patterns and any other
+// non-group nodes yield false.
+func patternNeedsAggregateMatching(pattern parser.CorpusNode) bool {
+	group, isGroup := pattern.(*parser.CorpusGroup)
+	if !isGroup {
+		return false
+	}
+	return group.Operation == "and" ||
+		slices.ContainsFunc(group.Operands, patternNeedsAggregateMatching)
+}
+
// matchCorpusFieldPattern checks if a single response field matches a pattern.
// Field patterns match directly. OR group patterns match if any operand matches.
// AND group patterns cannot match a single field.
@@ -529,9 +679,7 @@
func shallowCopyMap(m map[string]any) map[string]any {
result := make(map[string]any, len(m))
- for k, v := range m {
- result[k] = v
- }
+ maps.Copy(result, m)
return result
}
@@ -571,4 +719,3 @@
}
}
}
-
diff --git a/mapper/corpus_test.go b/mapper/corpus_test.go
index c5e129c..27a1a3c 100644
--- a/mapper/corpus_test.go
+++ b/mapper/corpus_test.go
@@ -1,6 +1,8 @@
package mapper
import (
+ "encoding/json"
+ "os"
"testing"
"github.com/KorAP/Koral-Mapper/config"
@@ -598,6 +600,45 @@
assert.Equal(t, true, mapped2["mapped"])
}
+// TestCorpusResponseWikiDeReKoFixtureEnrichment loads the wiki-dereko mapping
+// and the corpus-response fixture, applies the response mappings in BtoA
+// direction, and checks that the fields under "document" contain a "wikiCat"
+// entry with the value "Science".
+func TestCorpusResponseWikiDeReKoFixtureEnrichment(t *testing.T) {
+	cfg, err := config.LoadFromSources("", []string{"../mappings/wiki-dereko.yaml"})
+	require.NoError(t, err)
+
+	m, err := NewMapper(cfg.Lists)
+	require.NoError(t, err)
+
+	raw, err := os.ReadFile("../testdata/corpus-response.json")
+	require.NoError(t, err)
+
+	var input map[string]any
+	require.NoError(t, json.Unmarshal(raw, &input))
+
+	result, err := m.ApplyResponseMappings("wiki-dereko", MappingOptions{Direction: BtoA}, input)
+	require.NoError(t, err)
+
+	// Use checked type assertions: an unexpected response shape should fail
+	// this test with a readable message instead of panicking and aborting the
+	// whole test binary.
+	resultMap, ok := result.(map[string]any)
+	require.True(t, ok, "result should be a JSON object, got %T", result)
+	document, ok := resultMap["document"].(map[string]any)
+	require.True(t, ok, "result[\"document\"] should be a JSON object, got %T", resultMap["document"])
+	fields, ok := document["fields"].([]any)
+	require.True(t, ok, "document[\"fields\"] should be an array, got %T", document["fields"])
+
+	var wikiValues []string
+	for _, fieldRaw := range fields {
+		field, ok := fieldRaw.(map[string]any)
+		if !ok {
+			continue
+		}
+		key, _ := field["key"].(string)
+		if key != "wikiCat" {
+			continue
+		}
+		if value, ok := field["value"].(string); ok {
+			wikiValues = append(wikiValues, value)
+		}
+	}
+
+	assert.NotEmpty(t, wikiValues, "expected wiki categories to be enriched from textClass values")
+	assert.Contains(t, wikiValues, "Science")
+}
+
func TestCorpusResponseRegexMatch(t *testing.T) {
m := newCorpusMapper(t, "textClass=wissenschaft.*#regex <> genre=science")
diff --git a/mapper/response.go b/mapper/response.go
index 14fd978..e870a29 100644
--- a/mapper/response.go
+++ b/mapper/response.go
@@ -2,6 +2,7 @@
import (
"fmt"
+ "maps"
"strings"
"github.com/KorAP/Koral-Mapper/ast"
@@ -143,9 +144,7 @@
// Create a copy of the input data and update the snippet
result := make(map[string]any)
- for k, v := range jsonMap {
- result[k] = v
- }
+ maps.Copy(result, jsonMap)
result["snippet"] = processedSnippet
return result, nil
@@ -242,9 +241,9 @@
trimmed := strings.TrimSpace(text)
if token, ok := tokenByStartPos[textPos]; ok && trimmed != "" && trimmed == token.Text {
- trimStart := strings.Index(text, trimmed)
- leadingWS := text[:trimStart]
- trailingWS := text[trimStart+len(trimmed):]
+ before, after, _ := strings.Cut(text, trimmed)
+ leadingWS := before
+ trailingWS := after
result.WriteString(leadingWS)