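Precompile corpus regex to prevent per-request compilation

Regex-typed corpus field patterns are now compiled once in NewMapper
and cached on the Mapper, so matching no longer calls regexp.Compile
for every request. As a consequence, an invalid regex in a corpus
mapping list now makes NewMapper return an error instead of silently
never matching at request time.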

Change-Id: Ib78258602ceb4ae5ac6daa84e8ea08965157adfd
diff --git a/mapper/corpus.go b/mapper/corpus.go
index 1a5db5b..40446de 100644
--- a/mapper/corpus.go
+++ b/mapper/corpus.go
@@ -2,7 +2,6 @@
 
 import (
 	"maps"
-	"regexp"
 	"slices"
 
 	"github.com/KorAP/Koral-Mapper/ast"
@@ -68,7 +67,7 @@
 		pattern, replacement = rule.Lower, rule.Upper
 	}
 
-	if matchCorpusNode(pattern, node) {
+	if m.matchCorpusNode(pattern, node) {
 		// AND subset match: node has more operands than pattern
 		if pg, ok := pattern.(*parser.CorpusGroup); ok && pg.Operation == "and" {
 			operandsRaw, _ := node["operands"].([]any)
@@ -126,7 +125,7 @@
 			if !ok {
 				continue
 			}
-			if matchCorpusNode(patOp, docOp) {
+			if m.matchCorpusNode(patOp, docOp) {
 				used[j] = true
 				break
 			}
@@ -167,16 +166,16 @@
 // For CorpusField patterns, the node must be a koral:doc/koral:field.
 // For CorpusGroup patterns, the node must be a koral:docGroup/koral:fieldGroup
 // with matching operation and exactly matching operands (commutative).
-func matchCorpusNode(pattern parser.CorpusNode, node map[string]any) bool {
+func (m *Mapper) matchCorpusNode(pattern parser.CorpusNode, node map[string]any) bool {
 	switch p := pattern.(type) {
 	case *parser.CorpusField:
 		atType, _ := node["@type"].(string)
 		if atType != "koral:doc" && atType != "koral:field" {
 			return false
 		}
-		return matchCorpusField(p, node)
+		return m.matchCorpusField(p, node)
 	case *parser.CorpusGroup:
-		return matchCorpusGroupNode(p, node)
+		return m.matchCorpusGroupNode(p, node)
 	}
 	return false
 }
@@ -190,14 +189,14 @@
 // AND patterns: the node must be a docGroup/fieldGroup with AND operation
 // and all pattern operands must be found (subset matching — the node may
 // have additional operands beyond those in the pattern).
-func matchCorpusGroupNode(pattern *parser.CorpusGroup, node map[string]any) bool {
+func (m *Mapper) matchCorpusGroupNode(pattern *parser.CorpusGroup, node map[string]any) bool {
 	atType, _ := node["@type"].(string)
 
 	if pattern.Operation == "or" {
 		// Leaf nodes: any-operand matching
 		if atType == "koral:doc" || atType == "koral:field" {
 			for _, op := range pattern.Operands {
-				if matchCorpusNode(op, node) {
+				if m.matchCorpusNode(op, node) {
 					return true
 				}
 			}
@@ -211,7 +210,7 @@
 		if operation != "operation:or" {
 			return false
 		}
-		return matchGroupOperands(pattern.Operands, node, true)
+		return m.matchGroupOperands(pattern.Operands, node, true)
 	}
 
 	// AND patterns: subset matching
@@ -222,14 +221,14 @@
 	if operation != "operation:and" {
 		return false
 	}
-	return matchGroupOperands(pattern.Operands, node, false)
+	return m.matchGroupOperands(pattern.Operands, node, false)
 }
 
 // matchGroupOperands checks if a docGroup's operands match a pattern's
 // operands using commutative set matching. When exactCount is true, the
 // operand counts must be equal; otherwise subset matching is used (the
 // node may have more operands than the pattern).
-func matchGroupOperands(patternOps []parser.CorpusNode, node map[string]any, exactCount bool) bool {
+func (m *Mapper) matchGroupOperands(patternOps []parser.CorpusNode, node map[string]any, exactCount bool) bool {
 	operandsRaw, ok := node["operands"].([]any)
 	if !ok {
 		return false
@@ -255,7 +254,7 @@
 			if !ok {
 				continue
 			}
-			if matchCorpusNode(patOp, docOp) {
+			if m.matchCorpusNode(patOp, docOp) {
 				used[j] = true
 				found = true
 				break
@@ -269,7 +268,7 @@
 }
 
 // matchCorpusField checks if a koral:doc JSON node matches a CorpusField pattern.
-func matchCorpusField(pattern *parser.CorpusField, doc map[string]any) bool {
+func (m *Mapper) matchCorpusField(pattern *parser.CorpusField, doc map[string]any) bool {
 	docKey, _ := doc["key"].(string)
 	if docKey != pattern.Key {
 		return false
@@ -277,11 +276,8 @@
 
 	docValue, _ := doc["value"].(string)
 	if pattern.Type == "regex" {
-		re, err := regexp.Compile("^" + pattern.Value + "$")
-		if err != nil {
-			return false
-		}
-		if !re.MatchString(docValue) {
+		re := m.compiledRegexes["^"+pattern.Value+"$"]
+		if re == nil || !re.MatchString(docValue) {
 			return false
 		}
 	} else if docValue != pattern.Value {
@@ -501,7 +497,7 @@
 			pattern, replacement = rule.Lower, rule.Upper
 		}
 
-		if !matchCorpusFieldPattern(pattern, pseudoDoc) {
+		if !m.matchCorpusFieldPattern(pattern, pseudoDoc) {
 			continue
 		}
 
@@ -528,7 +524,7 @@
 		if !patternNeedsAggregateMatching(pattern) {
 			continue
 		}
-		if !matchCorpusPatternAgainstValues(pattern, values) {
+		if !m.matchCorpusPatternAgainstValues(pattern, values) {
 			continue
 		}
 
@@ -567,13 +563,13 @@
 	return values
 }
 
-func matchCorpusPatternAgainstValues(pattern parser.CorpusNode, values map[string][]string) bool {
+func (m *Mapper) matchCorpusPatternAgainstValues(pattern parser.CorpusNode, values map[string][]string) bool {
 	switch p := pattern.(type) {
 	case *parser.CorpusField:
 		if p.Key == "" {
 			for key, keyValues := range values {
 				for _, value := range keyValues {
-					if matchCorpusField(p, map[string]any{"key": key, "value": value}) {
+					if m.matchCorpusField(p, map[string]any{"key": key, "value": value}) {
 						return true
 					}
 				}
@@ -581,7 +577,7 @@
 			return false
 		}
 		for _, value := range values[p.Key] {
-			if matchCorpusField(p, map[string]any{"key": p.Key, "value": value}) {
+			if m.matchCorpusField(p, map[string]any{"key": p.Key, "value": value}) {
 				return true
 			}
 		}
@@ -590,7 +586,7 @@
 	case *parser.CorpusGroup:
 		if p.Operation == "or" {
 			for _, op := range p.Operands {
-				if matchCorpusPatternAgainstValues(op, values) {
+				if m.matchCorpusPatternAgainstValues(op, values) {
 					return true
 				}
 			}
@@ -598,7 +594,7 @@
 		}
 
 		for _, op := range p.Operands {
-			if !matchCorpusPatternAgainstValues(op, values) {
+			if !m.matchCorpusPatternAgainstValues(op, values) {
 				return false
 			}
 		}
@@ -626,14 +622,14 @@
 // matchCorpusFieldPattern checks if a single response field matches a pattern.
 // Field patterns match directly. OR group patterns match if any operand matches.
 // AND group patterns cannot match a single field.
-func matchCorpusFieldPattern(pattern parser.CorpusNode, doc map[string]any) bool {
+func (m *Mapper) matchCorpusFieldPattern(pattern parser.CorpusNode, doc map[string]any) bool {
 	switch p := pattern.(type) {
 	case *parser.CorpusField:
-		return matchCorpusField(p, doc)
+		return m.matchCorpusField(p, doc)
 	case *parser.CorpusGroup:
 		if p.Operation == "or" {
 			for _, op := range p.Operands {
-				if matchCorpusFieldPattern(op, doc) {
+				if m.matchCorpusFieldPattern(op, doc) {
 					return true
 				}
 			}
diff --git a/mapper/corpus_test.go b/mapper/corpus_test.go
index 8ab9be2..cc2381b 100644
--- a/mapper/corpus_test.go
+++ b/mapper/corpus_test.go
@@ -293,6 +293,21 @@
 	assert.Equal(t, "type", operands[1].(map[string]any)["key"])
 }
 
+func TestCorpusQueryInvalidRegexFailsAtStartup(t *testing.T) {
+	_, err := NewMapper([]config.MappingList{{
+		ID:       "corpus-test",
+		Type:     "corpus",
+		Mappings: []config.MappingRule{"textClass=[invalid#regex <> genre=broken"},
+	}})
+	// ErrorContains is nil-safe; calling err.Error() on a nil err would panic instead of failing the test.
+	assert.ErrorContains(t, err, "regex", "invalid regex should fail at NewMapper time, not silently at match time")
+}
+
+func TestCorpusQueryRegexCompiledOnce(t *testing.T) {
+	mapper := newCorpusMapper(t, "textClass=wissenschaft.*#regex <> genre=science")
+	assert.Greater(t, len(mapper.compiledRegexes), 0, "regex cache should be populated at startup")
+}
+
 func TestCorpusQueryRegexMatch(t *testing.T) {
 	m := newCorpusMapper(t, "textClass=wissenschaft.*#regex <> genre=science")
 
diff --git a/mapper/mapper.go b/mapper/mapper.go
index f72c5c5..6c21500 100644
--- a/mapper/mapper.go
+++ b/mapper/mapper.go
@@ -2,6 +2,7 @@
 
 import (
 	"fmt"
+	"regexp"
 
 	"github.com/KorAP/Koral-Mapper/config"
 	"github.com/KorAP/Koral-Mapper/parser"
@@ -42,6 +43,7 @@
 	mappingLists      map[string]*config.MappingList
 	parsedQueryRules  map[string][]*parser.MappingResult
 	parsedCorpusRules map[string][]*parser.CorpusMappingResult
+	compiledRegexes   map[string]*regexp.Regexp
 }
 
 // NewMapper creates a new Mapper instance from a list of MappingLists
@@ -50,9 +52,9 @@
 		mappingLists:      make(map[string]*config.MappingList),
 		parsedQueryRules:  make(map[string][]*parser.MappingResult),
 		parsedCorpusRules: make(map[string][]*parser.CorpusMappingResult),
+		compiledRegexes:   make(map[string]*regexp.Regexp),
 	}
 
-	// Store mapping lists by ID
 	for _, list := range lists {
 		if _, exists := m.mappingLists[list.ID]; exists {
 			return nil, fmt.Errorf("duplicate mapping list ID found: %s", list.ID)
@@ -66,6 +68,14 @@
 			if err != nil {
 				return nil, fmt.Errorf("failed to parse corpus mappings for list %s: %w", list.ID, err)
 			}
+			for _, rule := range corpusRules {
+				if err := m.precompileCorpusRegexes(rule.Upper); err != nil {
+					return nil, fmt.Errorf("invalid regex in corpus mapping list %s: %w", list.ID, err)
+				}
+				if err := m.precompileCorpusRegexes(rule.Lower); err != nil {
+					return nil, fmt.Errorf("invalid regex in corpus mapping list %s: %w", list.ID, err)
+				}
+			}
 			m.parsedCorpusRules[list.ID] = corpusRules
 		} else {
 			queryRules, err := list.ParseMappings()
@@ -79,6 +89,31 @@
 	return m, nil
 }
 
+// precompileCorpusRegexes recursively visits a corpus pattern tree and caches
+// a compiled, "^...$"-anchored regexp for each field whose Type is "regex".
+// The first expression that fails to compile aborts the walk with an error.
+func (m *Mapper) precompileCorpusRegexes(node parser.CorpusNode) error {
+	switch n := node.(type) {
+	case *parser.CorpusGroup:
+		for _, child := range n.Operands {
+			if err := m.precompileCorpusRegexes(child); err != nil {
+				return err
+			}
+		}
+	case *parser.CorpusField:
+		anchored := "^" + n.Value + "$"
+		if _, cached := m.compiledRegexes[anchored]; cached || n.Type != "regex" {
+			return nil // not a regex field, or this expression is already compiled
+		}
+		compiled, err := regexp.Compile(anchored)
+		if err != nil {
+			return fmt.Errorf("failed to compile regex %q: %w", n.Value, err)
+		}
+		m.compiledRegexes[anchored] = compiled
+	}
+	return nil
+}
+
 // MappingOptions contains the options for applying mappings
 type MappingOptions struct {
 	FoundryA    string