Fix corpus response mapping for fields nested under "document"
Change-Id: Ib305c6430e997f5afdf07310844fab6c7f6443ac
diff --git a/cmd/koralmapper/main_test.go b/cmd/koralmapper/main_test.go
index ecbb5e8..64ea8be 100644
--- a/cmd/koralmapper/main_test.go
+++ b/cmd/koralmapper/main_test.go
@@ -700,18 +700,18 @@
assert.Equal(t, http.StatusOK, resp.StatusCode)
- var result map[string]interface{}
+ var result map[string]any
err = json.NewDecoder(resp.Body).Decode(&result)
require.NoError(t, err)
// Check that the mapping was applied
- wrap := result["wrap"].(map[string]interface{})
+ wrap := result["wrap"].(map[string]any)
if tc.expectGroup {
// For complex mappings, check the first operand
assert.Equal(t, "koral:termGroup", wrap["@type"])
- operands := wrap["operands"].([]interface{})
+ operands := wrap["operands"].([]any)
require.Greater(t, len(operands), 0)
- firstOperand := operands[0].(map[string]interface{})
+ firstOperand := operands[0].(map[string]any)
assert.Equal(t, tc.expectedKey, firstOperand["key"])
} else {
// For simple mappings, check the key directly
@@ -1111,16 +1111,16 @@
assert.Equal(t, http.StatusOK, resp.StatusCode)
- var result map[string]interface{}
+ var result map[string]any
err = json.NewDecoder(resp.Body).Decode(&result)
require.NoError(t, err)
// Verify the transformation was applied
- wrap := result["wrap"].(map[string]interface{})
+ wrap := result["wrap"].(map[string]any)
assert.Equal(t, "koral:termGroup", wrap["@type"])
- operands := wrap["operands"].([]interface{})
+ operands := wrap["operands"].([]any)
require.Greater(t, len(operands), 0)
- firstOperand := operands[0].(map[string]interface{})
+ firstOperand := operands[0].(map[string]any)
assert.Equal(t, "DET", firstOperand["key"])
}
diff --git a/mapper/benchmark_test.go b/mapper/benchmark_test.go
index fcf28a6..289cd8c 100644
--- a/mapper/benchmark_test.go
+++ b/mapper/benchmark_test.go
@@ -181,7 +181,7 @@
func BenchmarkApplyQueryMappingsWorstCase(b *testing.B) {
// Create a mapper with many rules
manyRules := make([]config.MappingRule, 100)
- for i := 0; i < 100; i++ {
+ for i := range 100 {
ruleChar := string(rune('A' + i%26))
manyRules[i] = config.MappingRule("[UNUSED" + ruleChar + "] <> [TARGET" + ruleChar + "]")
}
diff --git a/mapper/corpus.go b/mapper/corpus.go
index 9aad21e..f3f2b0c 100644
--- a/mapper/corpus.go
+++ b/mapper/corpus.go
@@ -1,7 +1,9 @@
package mapper
import (
+ "maps"
"regexp"
+ "slices"
"github.com/KorAP/Koral-Mapper/parser"
)
@@ -390,12 +392,7 @@
return jsonData, nil
}
- fieldsRaw, exists := jsonMap["fields"]
- if !exists {
- return jsonData, nil
- }
-
- fields, ok := fieldsRaw.([]any)
+ fieldsInDocument, fields, ok := extractResponseFieldsContainer(jsonMap)
if !ok {
return jsonData, nil
}
@@ -421,11 +418,52 @@
newFields = append(newFields, mapped...)
}
+ fieldValues := collectResponseFieldValues(fields)
+ newFields = append(newFields, m.matchGroupPatternsAndCollect(fieldValues, rules, opts)...)
+
result := shallowCopyMap(jsonMap)
- result["fields"] = newFields
+ if !fieldsInDocument {
+ result["fields"] = newFields
+ return result, nil
+ }
+
+ if document, ok := jsonMap["document"].(map[string]any); ok {
+ documentCopy := shallowCopyMap(document)
+ documentCopy["fields"] = newFields
+ result["document"] = documentCopy
+ }
+
return result, nil
}
+// extractResponseFieldsContainer locates the array of response fields in a
+// decoded response. A top-level "fields" array takes precedence; otherwise
+// the array stored under "document" -> "fields" is used.
+//
+// The results are, in order: whether the array was found inside "document",
+// the array itself, and whether a field array was found at all. When no
+// usable array exists the result is (false, nil, false).
+func extractResponseFieldsContainer(jsonMap map[string]any) (bool, []any, bool) {
+	// A top-level "fields" entry that is not an array is ignored and the
+	// lookup falls through to the document level.
+	if topLevel, isArray := jsonMap["fields"].([]any); isArray {
+		return false, topLevel, true
+	}
+
+	// A type assertion on a missing key and a lookup in a nil map both yield
+	// zero values, so the chain needs no explicit existence checks.
+	document, _ := jsonMap["document"].(map[string]any)
+	nested, isArray := document["fields"].([]any)
+	if !isArray {
+		return false, nil, false
+	}
+	return true, nested, true
+}
+
// matchFieldAndCollect matches a field's key/value against rules and returns mapped entries.
// For array values, each element is matched individually.
func (m *Mapper) matchFieldAndCollect(key string, value any, rules []*parser.CorpusMappingResult, opts MappingOptions) []any {
@@ -474,6 +512,118 @@
return results
}
+// matchGroupPatternsAndCollect evaluates the rules whose pattern side can
+// only be decided against the complete set of response values (those for
+// which patternNeedsAggregateMatching reports true) and returns the
+// replacement fields of every such rule that matches.
+//
+// values maps each response field key to the string values collected for it.
+// opts.Direction selects the pattern side: with AtoB the rule's Upper node is
+// the pattern and Lower the replacement; any other direction swaps the two.
+// The result is nil when no rule matches.
+//
+// NOTE(review): an "or" group that nests an "and" group is handled here as a
+// whole. If its plain field operands are also matched per field elsewhere,
+// the same replacement could be emitted twice - confirm against the caller.
+func (m *Mapper) matchGroupPatternsAndCollect(values map[string][]string, rules []*parser.CorpusMappingResult, opts MappingOptions) []any {
+	var collected []any
+
+	forward := opts.Direction == AtoB
+	for _, rule := range rules {
+		var source, target parser.CorpusNode = rule.Lower, rule.Upper
+		if forward {
+			source, target = rule.Upper, rule.Lower
+		}
+
+		// Rules that need no aggregate view, or that do not match it,
+		// contribute nothing here.
+		if patternNeedsAggregateMatching(source) && matchCorpusPatternAgainstValues(source, values) {
+			collected = append(collected, collectReplacementFields(target)...)
+		}
+	}
+
+	return collected
+}
+
+// collectResponseFieldValues indexes the string values of response fields by
+// field key.
+//
+// Entries of fields that are not objects, or whose "key" is missing, empty or
+// not a string, are skipped. A string "value" contributes one entry; an array
+// "value" contributes each of its string elements, and non-string elements
+// are dropped. Values of any other type are ignored. Values of fields that
+// share a key are concatenated in input order. A key appears in the result
+// only if at least one string value was collected for it.
+func collectResponseFieldValues(fields []any) map[string][]string {
+	byKey := make(map[string][]string)
+
+	for _, entry := range fields {
+		field, isObject := entry.(map[string]any)
+		if !isObject {
+			continue
+		}
+		key, _ := field["key"].(string)
+		if key == "" {
+			continue
+		}
+
+		var found []string
+		switch value := field["value"].(type) {
+		case string:
+			found = []string{value}
+		case []any:
+			for _, item := range value {
+				if text, isString := item.(string); isString {
+					found = append(found, text)
+				}
+			}
+		}
+
+		// Only touch the map when something was collected, so keys without
+		// string values never show up in the result.
+		if len(found) > 0 {
+			byKey[key] = append(byKey[key], found...)
+		}
+	}
+
+	return byKey
+}
+
+// matchCorpusPatternAgainstValues reports whether pattern is satisfied by the
+// aggregated response values, where values maps a field key to the string
+// values collected for that key.
+//
+// A field pattern matches when matchCorpusField accepts at least one
+// key/value pair: only the values stored under the pattern's key are tried,
+// or the values of every key when the pattern's key is empty. An "or" group
+// matches when at least one operand matches. A group with any other operation
+// matches only when all operands match, so such a group without operands
+// matches trivially. Any other node type never matches.
+func matchCorpusPatternAgainstValues(pattern parser.CorpusNode, values map[string][]string) bool {
+	switch node := pattern.(type) {
+	case *parser.CorpusField:
+		// anyAccepted tries the field pattern against each value of one key.
+		anyAccepted := func(key string, candidates []string) bool {
+			for _, candidate := range candidates {
+				if matchCorpusField(node, map[string]any{"key": key, "value": candidate}) {
+					return true
+				}
+			}
+			return false
+		}
+
+		if node.Key != "" {
+			return anyAccepted(node.Key, values[node.Key])
+		}
+		// A pattern without a key is tried against the values of every key.
+		for key, candidates := range values {
+			if anyAccepted(key, candidates) {
+				return true
+			}
+		}
+		return false
+
+	case *parser.CorpusGroup:
+		operandMatches := func(operand parser.CorpusNode) bool {
+			return matchCorpusPatternAgainstValues(operand, values)
+		}
+		if node.Operation == "or" {
+			return slices.ContainsFunc(node.Operands, operandMatches)
+		}
+		// Every operation other than "or" is evaluated as a conjunction.
+		for _, operand := range node.Operands {
+			if !operandMatches(operand) {
+				return false
+			}
+		}
+		return true
+	}
+
+	return false
+}
+
+// patternNeedsAggregateMatching reports whether pattern is, or contains at
+// any depth, a group whose operation is "and". Field patterns and every other
+// non-group node yield false.
+func patternNeedsAggregateMatching(pattern parser.CorpusNode) bool {
+	group, isGroup := pattern.(*parser.CorpusGroup)
+	if !isGroup {
+		return false
+	}
+	// Either the group itself is a conjunction or one of its operands
+	// (checked recursively) needs the aggregate view.
+	return group.Operation == "and" ||
+		slices.ContainsFunc(group.Operands, patternNeedsAggregateMatching)
+}
+
// matchCorpusFieldPattern checks if a single response field matches a pattern.
// Field patterns match directly. OR group patterns match if any operand matches.
// AND group patterns cannot match a single field.
@@ -529,9 +679,7 @@
func shallowCopyMap(m map[string]any) map[string]any {
result := make(map[string]any, len(m))
- for k, v := range m {
- result[k] = v
- }
+ maps.Copy(result, m) // copies top-level entries only; nested maps and slices stay shared with m
return result
}
@@ -571,4 +719,3 @@
}
}
}
-
diff --git a/mapper/corpus_test.go b/mapper/corpus_test.go
index c5e129c..27a1a3c 100644
--- a/mapper/corpus_test.go
+++ b/mapper/corpus_test.go
@@ -1,6 +1,8 @@
package mapper
import (
+ "encoding/json"
+ "os"
"testing"
"github.com/KorAP/Koral-Mapper/config"
@@ -598,6 +600,45 @@
assert.Equal(t, true, mapped2["mapped"])
}
+// TestCorpusResponseWikiDeReKoFixtureEnrichment applies the "wiki-dereko"
+// mapping list in BtoA direction to the recorded corpus response fixture,
+// whose fields are nested under "document", and expects a "wikiCat" field
+// with the value "Science" among the resulting document fields.
+func TestCorpusResponseWikiDeReKoFixtureEnrichment(t *testing.T) {
+	cfg, err := config.LoadFromSources("", []string{"../mappings/wiki-dereko.yaml"})
+	require.NoError(t, err)
+
+	m, err := NewMapper(cfg.Lists)
+	require.NoError(t, err)
+
+	raw, err := os.ReadFile("../testdata/corpus-response.json")
+	require.NoError(t, err)
+
+	var input map[string]any
+	require.NoError(t, json.Unmarshal(raw, &input))
+
+	result, err := m.ApplyResponseMappings("wiki-dereko", MappingOptions{Direction: BtoA}, input)
+	require.NoError(t, err)
+
+	// Use checked type assertions so an unexpected result shape is reported
+	// as a test failure with a message instead of panicking.
+	resultMap, ok := result.(map[string]any)
+	require.True(t, ok, "result should be a JSON object, got %T", result)
+	document, ok := resultMap["document"].(map[string]any)
+	require.True(t, ok, "result should keep a \"document\" object, got %T", resultMap["document"])
+	fields, ok := document["fields"].([]any)
+	require.True(t, ok, "document should keep a \"fields\" array, got %T", document["fields"])
+
+	// Collect the string values of all wikiCat fields.
+	var wikiValues []string
+	for _, fieldRaw := range fields {
+		field, ok := fieldRaw.(map[string]any)
+		if !ok {
+			continue
+		}
+		key, _ := field["key"].(string)
+		if key != "wikiCat" {
+			continue
+		}
+		if value, ok := field["value"].(string); ok {
+			wikiValues = append(wikiValues, value)
+		}
+	}
+
+	assert.NotEmpty(t, wikiValues, "expected wiki categories to be enriched from textClass values")
+	assert.Contains(t, wikiValues, "Science")
+}
+
func TestCorpusResponseRegexMatch(t *testing.T) {
m := newCorpusMapper(t, "textClass=wissenschaft.*#regex <> genre=science")
diff --git a/mapper/response.go b/mapper/response.go
index 14fd978..e870a29 100644
--- a/mapper/response.go
+++ b/mapper/response.go
@@ -2,6 +2,7 @@
import (
"fmt"
+ "maps"
"strings"
"github.com/KorAP/Koral-Mapper/ast"
@@ -143,9 +144,7 @@
// Create a copy of the input data and update the snippet
result := make(map[string]any)
- for k, v := range jsonMap {
- result[k] = v
- }
+ maps.Copy(result, jsonMap)
result["snippet"] = processedSnippet
return result, nil
@@ -242,9 +241,9 @@
trimmed := strings.TrimSpace(text)
if token, ok := tokenByStartPos[textPos]; ok && trimmed != "" && trimmed == token.Text {
- trimStart := strings.Index(text, trimmed)
- leadingWS := text[:trimStart]
- trailingWS := text[trimStart+len(trimmed):]
+ before, after, _ := strings.Cut(text, trimmed)
+ leadingWS := before
+ trailingWS := after
result.WriteString(leadingWS)
diff --git a/matcher/snippet_matcher_test.go b/matcher/snippet_matcher_test.go
index 3142f9f..9b3a2fd 100644
--- a/matcher/snippet_matcher_test.go
+++ b/matcher/snippet_matcher_test.go
@@ -1,6 +1,7 @@
package matcher
import (
+ "slices"
"testing"
"github.com/KorAP/Koral-Mapper/ast"
@@ -269,13 +270,7 @@
assert.Equal(t, expectedTexts[i], token.Text)
// Verify that each token has the required annotation
- hasGenderMasc := false
- for _, annotation := range token.Annotations {
- if annotation == "marmot/m:gender:masc" {
- hasGenderMasc = true
- break
- }
- }
+ hasGenderMasc := slices.Contains(token.Annotations, "marmot/m:gender:masc")
assert.True(t, hasGenderMasc, "Token %s should have marmot/m:gender:masc annotation", token.Text)
}
}
diff --git a/parser/corpus_parser.go b/parser/corpus_parser.go
index 9debf3a..187a618 100644
--- a/parser/corpus_parser.go
+++ b/parser/corpus_parser.go
@@ -95,13 +95,13 @@
// ParseMapping parses a corpus mapping rule of the form "pattern <> replacement".
func (p *CorpusParser) ParseMapping(input string) (*CorpusMappingResult, error) {
- sepIdx := strings.Index(input, "<>")
- if sepIdx == -1 {
+ before, after, ok := strings.Cut(input, "<>")
+ if !ok {
return nil, fmt.Errorf("invalid corpus mapping rule: missing <> separator in %q", input)
}
- leftStr := strings.TrimSpace(input[:sepIdx])
- rightStr := strings.TrimSpace(input[sepIdx+2:])
+ leftStr := strings.TrimSpace(before)
+ rightStr := strings.TrimSpace(after)
if leftStr == "" {
return nil, fmt.Errorf("invalid corpus mapping rule: empty left side")
@@ -183,16 +183,16 @@
func (p *CorpusParser) parseField(input string) (*CorpusField, error) {
input = strings.TrimSpace(input)
- eqIdx := strings.Index(input, "=")
- if eqIdx == -1 {
+ before, after, ok := strings.Cut(input, "=")
+ if !ok {
if !p.AllowBareValues {
return nil, fmt.Errorf("invalid field expression: missing '=' in %q", input)
}
return p.parseBareValue(input)
}
- key := strings.TrimSpace(input[:eqIdx])
- rest := strings.TrimSpace(input[eqIdx+1:])
+ key := strings.TrimSpace(before)
+ rest := strings.TrimSpace(after)
if key == "" {
return nil, fmt.Errorf("invalid field expression: empty key")
diff --git a/parser/grammar_parser.go b/parser/grammar_parser.go
index 7c949f8..a224181 100644
--- a/parser/grammar_parser.go
+++ b/parser/grammar_parser.go
@@ -156,7 +156,7 @@
// Check if this parenthesis is inside brackets (part of an identifier)
insideBrackets := false
bracketDepth := 0
- for j := 0; j < i; j++ {
+ for j := range i {
if runes[j] == '[' {
bracketDepth++
} else if runes[j] == ']' {
diff --git a/testdata/corpus-response.json b/testdata/corpus-response.json
new file mode 100644
index 0000000..450de33
--- /dev/null
+++ b/testdata/corpus-response.json
@@ -0,0 +1 @@
+{"@context":"http:\/\/korap.ids-mannheim.de\/ns\/KoralQuery\/v0.3\/context.jsonld","document":{"@type":"koral:document","fields":[{"@type":"koral:field","key":"externalLink","type":"type:attachement","value":"data:application\/x.korap-link;title=Wikipedia,http:\/\/de.wikipedia.org\/wiki\/Benutzer_Diskussion:Dalue2"},{"@type":"koral:field","key":"docSigle","type":"type:string","value":"WUD17\/D96"},{"@type":"koral:field","key":"language","type":"type:string","value":"de"},{"@type":"koral:field","key":"textTypeArt","type":"type:string","value":"Benutzerdiskussion"},{"@type":"koral:field","key":"docTitle","type":"type:text","value":"Wikipedia, Benutzerdiskussionen mit Anfangsbuchstabe D, Teil 96"},{"@type":"koral:field","key":"availability","type":"type:string","value":"CC-BY-SA"},{"@type":"koral:field","key":"title","type":"type:text","value":"Benutzer Diskussion:Dalue2"},{"@type":"koral:field","key":"reference","type":"type:attachement","value":"data:,Benutzer Diskussion:Dalue2, In: Wikipedia - URL:http:\/\/de.wikipedia.org\/wiki\/Benutzer_Diskussion:Dalue2: Wikipedia, 2017"},{"@type":"koral:field","key":"textClass","type":"type:keywords","value":["wissenschaft","populaerwissenschaft","technik-industrie","edv-elektronik"]},{"@type":"koral:field","key":"indexCreationDate","type":"type:date","value":"2019-02-27"},{"@type":"koral:field","key":"layerInfos","type":"type:store","value":"corenlp\/c=spans corenlp\/p=tokens corenlp\/s=spans dereko\/s=spans malt\/d=rels marmot\/m=tokens marmot\/p=tokens opennlp\/p=tokens opennlp\/s=spans tt\/l=tokens 
tt\/p=tokens"},{"@type":"koral:field","key":"pubPlace","type":"type:string","value":"URL:http:\/\/de.wikipedia.org"},{"@type":"koral:field","key":"corpusSigle","type":"type:string","value":"WUD17"},{"@type":"koral:field","key":"corpusEditor","type":"type:attachement","value":"data:,wikipedia.org"},{"@type":"koral:field","key":"editor","type":"type:attachement","value":"data:,wikipedia.org"},{"@type":"koral:field","key":"textSigle","type":"type:string","value":"WUD17\/D96\/32955"},{"@type":"koral:field","key":"author","type":"type:text","value":"Xneb20, u.a."},{"@type":"koral:field","key":"textType","type":"type:string","value":"Benutzerdiskussionen"},{"@type":"koral:field","key":"foundries","type":"type:keywords","value":["corenlp","corenlp\/constituency","corenlp\/morpho","corenlp\/sentences","dereko","dereko\/structure","dereko\/structure\/base-sentences-paragraphs-pagebreaks","malt","malt\/dependency","marmot","marmot\/morpho","opennlp","opennlp\/morpho","opennlp\/sentences","treetagger","treetagger\/morpho"]},{"@type":"koral:field","key":"creationDate","type":"type:date","value":"2016-10-24"},{"@type":"koral:field","key":"pubDate","type":"type:date","value":"2017-07-01"},{"@type":"koral:field","key":"tokenSource","type":"type:store","value":"base#tokens"},{"@type":"koral:field","key":"indexLastModified","type":"type:date","value":"2019-02-27"},{"@type":"koral:field","key":"publisher","type":"type:attachement","value":"data:,Wikipedia"},{"@type":"koral:field","key":"corpusTitle","type":"type:text","value":"Wikipedia"}]},"messages":[["Response format is temporary"]],"meta":{}}