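Precompile corpus regexes at startup to avoid per-match compilation

Invalid regex patterns in corpus mappings now make NewMapper return an
error instead of silently never matching at match time.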
Change-Id: Ib78258602ceb4ae5ac6daa84e8ea08965157adfd
diff --git a/mapper/corpus.go b/mapper/corpus.go
index 1a5db5b..40446de 100644
--- a/mapper/corpus.go
+++ b/mapper/corpus.go
@@ -2,7 +2,6 @@
import (
"maps"
- "regexp"
"slices"
"github.com/KorAP/Koral-Mapper/ast"
@@ -68,7 +67,7 @@
pattern, replacement = rule.Lower, rule.Upper
}
- if matchCorpusNode(pattern, node) {
+ if m.matchCorpusNode(pattern, node) {
// AND subset match: node has more operands than pattern
if pg, ok := pattern.(*parser.CorpusGroup); ok && pg.Operation == "and" {
operandsRaw, _ := node["operands"].([]any)
@@ -126,7 +125,7 @@
if !ok {
continue
}
- if matchCorpusNode(patOp, docOp) {
+ if m.matchCorpusNode(patOp, docOp) {
used[j] = true
break
}
@@ -167,16 +166,16 @@
// For CorpusField patterns, the node must be a koral:doc/koral:field.
// For CorpusGroup patterns, the node must be a koral:docGroup/koral:fieldGroup
// with matching operation and exactly matching operands (commutative).
-func matchCorpusNode(pattern parser.CorpusNode, node map[string]any) bool {
+func (m *Mapper) matchCorpusNode(pattern parser.CorpusNode, node map[string]any) bool {
switch p := pattern.(type) {
case *parser.CorpusField:
atType, _ := node["@type"].(string)
if atType != "koral:doc" && atType != "koral:field" {
return false
}
- return matchCorpusField(p, node)
+ return m.matchCorpusField(p, node)
case *parser.CorpusGroup:
- return matchCorpusGroupNode(p, node)
+ return m.matchCorpusGroupNode(p, node)
}
return false
}
@@ -190,14 +189,14 @@
// AND patterns: the node must be a docGroup/fieldGroup with AND operation
// and all pattern operands must be found (subset matching — the node may
// have additional operands beyond those in the pattern).
-func matchCorpusGroupNode(pattern *parser.CorpusGroup, node map[string]any) bool {
+func (m *Mapper) matchCorpusGroupNode(pattern *parser.CorpusGroup, node map[string]any) bool {
atType, _ := node["@type"].(string)
if pattern.Operation == "or" {
// Leaf nodes: any-operand matching
if atType == "koral:doc" || atType == "koral:field" {
for _, op := range pattern.Operands {
- if matchCorpusNode(op, node) {
+ if m.matchCorpusNode(op, node) {
return true
}
}
@@ -211,7 +210,7 @@
if operation != "operation:or" {
return false
}
- return matchGroupOperands(pattern.Operands, node, true)
+ return m.matchGroupOperands(pattern.Operands, node, true)
}
// AND patterns: subset matching
@@ -222,14 +221,14 @@
if operation != "operation:and" {
return false
}
- return matchGroupOperands(pattern.Operands, node, false)
+ return m.matchGroupOperands(pattern.Operands, node, false)
}
// matchGroupOperands checks if a docGroup's operands match a pattern's
// operands using commutative set matching. When exactCount is true, the
// operand counts must be equal; otherwise subset matching is used (the
// node may have more operands than the pattern).
-func matchGroupOperands(patternOps []parser.CorpusNode, node map[string]any, exactCount bool) bool {
+func (m *Mapper) matchGroupOperands(patternOps []parser.CorpusNode, node map[string]any, exactCount bool) bool {
operandsRaw, ok := node["operands"].([]any)
if !ok {
return false
@@ -255,7 +254,7 @@
if !ok {
continue
}
- if matchCorpusNode(patOp, docOp) {
+ if m.matchCorpusNode(patOp, docOp) {
used[j] = true
found = true
break
@@ -269,7 +268,7 @@
}
// matchCorpusField checks if a koral:doc JSON node matches a CorpusField pattern.
-func matchCorpusField(pattern *parser.CorpusField, doc map[string]any) bool {
+func (m *Mapper) matchCorpusField(pattern *parser.CorpusField, doc map[string]any) bool {
docKey, _ := doc["key"].(string)
if docKey != pattern.Key {
return false
@@ -277,11 +276,8 @@
docValue, _ := doc["value"].(string)
if pattern.Type == "regex" {
- re, err := regexp.Compile("^" + pattern.Value + "$")
- if err != nil {
- return false
- }
- if !re.MatchString(docValue) {
+ re := m.compiledRegexes["^"+pattern.Value+"$"]
+ if re == nil || !re.MatchString(docValue) {
return false
}
} else if docValue != pattern.Value {
@@ -501,7 +497,7 @@
pattern, replacement = rule.Lower, rule.Upper
}
- if !matchCorpusFieldPattern(pattern, pseudoDoc) {
+ if !m.matchCorpusFieldPattern(pattern, pseudoDoc) {
continue
}
@@ -528,7 +524,7 @@
if !patternNeedsAggregateMatching(pattern) {
continue
}
- if !matchCorpusPatternAgainstValues(pattern, values) {
+ if !m.matchCorpusPatternAgainstValues(pattern, values) {
continue
}
@@ -567,13 +563,13 @@
return values
}
-func matchCorpusPatternAgainstValues(pattern parser.CorpusNode, values map[string][]string) bool {
+func (m *Mapper) matchCorpusPatternAgainstValues(pattern parser.CorpusNode, values map[string][]string) bool {
switch p := pattern.(type) {
case *parser.CorpusField:
if p.Key == "" {
for key, keyValues := range values {
for _, value := range keyValues {
- if matchCorpusField(p, map[string]any{"key": key, "value": value}) {
+ if m.matchCorpusField(p, map[string]any{"key": key, "value": value}) {
return true
}
}
@@ -581,7 +577,7 @@
return false
}
for _, value := range values[p.Key] {
- if matchCorpusField(p, map[string]any{"key": p.Key, "value": value}) {
+ if m.matchCorpusField(p, map[string]any{"key": p.Key, "value": value}) {
return true
}
}
@@ -590,7 +586,7 @@
case *parser.CorpusGroup:
if p.Operation == "or" {
for _, op := range p.Operands {
- if matchCorpusPatternAgainstValues(op, values) {
+ if m.matchCorpusPatternAgainstValues(op, values) {
return true
}
}
@@ -598,7 +594,7 @@
}
for _, op := range p.Operands {
- if !matchCorpusPatternAgainstValues(op, values) {
+ if !m.matchCorpusPatternAgainstValues(op, values) {
return false
}
}
@@ -626,14 +622,14 @@
// matchCorpusFieldPattern checks if a single response field matches a pattern.
// Field patterns match directly. OR group patterns match if any operand matches.
// AND group patterns cannot match a single field.
-func matchCorpusFieldPattern(pattern parser.CorpusNode, doc map[string]any) bool {
+func (m *Mapper) matchCorpusFieldPattern(pattern parser.CorpusNode, doc map[string]any) bool {
switch p := pattern.(type) {
case *parser.CorpusField:
- return matchCorpusField(p, doc)
+ return m.matchCorpusField(p, doc)
case *parser.CorpusGroup:
if p.Operation == "or" {
for _, op := range p.Operands {
- if matchCorpusFieldPattern(op, doc) {
+ if m.matchCorpusFieldPattern(op, doc) {
return true
}
}
diff --git a/mapper/corpus_test.go b/mapper/corpus_test.go
index 8ab9be2..cc2381b 100644
--- a/mapper/corpus_test.go
+++ b/mapper/corpus_test.go
@@ -293,6 +293,21 @@
assert.Equal(t, "type", operands[1].(map[string]any)["key"])
}
+// TestCorpusQueryInvalidRegexFailsAtStartup verifies that NewMapper rejects a
+// corpus mapping whose #regex value does not compile, and that the returned
+// error mentions "regex".
+func TestCorpusQueryInvalidRegexFailsAtStartup(t *testing.T) {
+	_, err := NewMapper([]config.MappingList{{
+		ID:       "corpus-test",
+		Type:     "corpus",
+		Mappings: []config.MappingRule{"textClass=[invalid#regex <> genre=broken"},
+	}})
+	// assert.Error records a failure but does not abort the test, so only
+	// inspect the message when an error was actually returned; calling
+	// err.Error() on a nil error would panic instead of failing cleanly.
+	if assert.Error(t, err, "invalid regex should fail at NewMapper time, not silently at match time") {
+		assert.Contains(t, err.Error(), "regex")
+	}
+}
+
+// TestCorpusQueryRegexCompiledOnce checks that a mapper built from a rule with
+// a #regex field has a non-empty compiledRegexes cache. It only inspects the
+// cache after construction; it does not itself prove that matching never
+// recompiles a pattern.
+func TestCorpusQueryRegexCompiledOnce(t *testing.T) {
+ m := newCorpusMapper(t, "textClass=wissenschaft.*#regex <> genre=science")
+ assert.NotEmpty(t, m.compiledRegexes, "regex cache should be populated at startup")
+}
+
func TestCorpusQueryRegexMatch(t *testing.T) {
m := newCorpusMapper(t, "textClass=wissenschaft.*#regex <> genre=science")
diff --git a/mapper/mapper.go b/mapper/mapper.go
index f72c5c5..6c21500 100644
--- a/mapper/mapper.go
+++ b/mapper/mapper.go
@@ -2,6 +2,7 @@
import (
"fmt"
+ "regexp"
"github.com/KorAP/Koral-Mapper/config"
"github.com/KorAP/Koral-Mapper/parser"
@@ -42,6 +43,7 @@
mappingLists map[string]*config.MappingList
parsedQueryRules map[string][]*parser.MappingResult
parsedCorpusRules map[string][]*parser.CorpusMappingResult
+ compiledRegexes map[string]*regexp.Regexp
}
// NewMapper creates a new Mapper instance from a list of MappingLists
@@ -50,9 +52,9 @@
mappingLists: make(map[string]*config.MappingList),
parsedQueryRules: make(map[string][]*parser.MappingResult),
parsedCorpusRules: make(map[string][]*parser.CorpusMappingResult),
+ compiledRegexes: make(map[string]*regexp.Regexp),
}
- // Store mapping lists by ID
for _, list := range lists {
if _, exists := m.mappingLists[list.ID]; exists {
return nil, fmt.Errorf("duplicate mapping list ID found: %s", list.ID)
@@ -66,6 +68,14 @@
if err != nil {
return nil, fmt.Errorf("failed to parse corpus mappings for list %s: %w", list.ID, err)
}
+ for _, rule := range corpusRules {
+ if err := m.precompileCorpusRegexes(rule.Upper); err != nil {
+ return nil, fmt.Errorf("invalid regex in corpus mapping list %s: %w", list.ID, err)
+ }
+ if err := m.precompileCorpusRegexes(rule.Lower); err != nil {
+ return nil, fmt.Errorf("invalid regex in corpus mapping list %s: %w", list.ID, err)
+ }
+ }
m.parsedCorpusRules[list.ID] = corpusRules
} else {
queryRules, err := list.ParseMappings()
@@ -79,6 +89,31 @@
return m, nil
}
+// precompileCorpusRegexes walks a CorpusNode tree and pre-compiles every
+// regex-typed field pattern into the compiledRegexes cache.
+//
+// Cache entries are keyed by "^" + Value + "$", which is the key
+// matchCorpusField looks up. The compiled expression wraps the value in a
+// non-capturing group so the anchors apply to the whole pattern: without the
+// group, "a|b" would compile as "^a|b$" and also accept values that merely
+// start with "a" or end with "b".
+//
+// It returns an error wrapping the regexp.Compile failure for the first
+// invalid pattern encountered. Nodes that are neither *parser.CorpusField nor
+// *parser.CorpusGroup are ignored.
+func (m *Mapper) precompileCorpusRegexes(node parser.CorpusNode) error {
+	switch n := node.(type) {
+	case *parser.CorpusField:
+		if n.Type != "regex" {
+			return nil
+		}
+		key := "^" + n.Value + "$"
+		if _, exists := m.compiledRegexes[key]; exists {
+			// Same pattern already compiled for another rule.
+			return nil
+		}
+		re, err := regexp.Compile("^(?:" + n.Value + ")$")
+		if err != nil {
+			return fmt.Errorf("failed to compile regex %q: %w", n.Value, err)
+		}
+		m.compiledRegexes[key] = re
+	case *parser.CorpusGroup:
+		for _, op := range n.Operands {
+			if err := m.precompileCorpusRegexes(op); err != nil {
+				return err
+			}
+		}
+	}
+	return nil
+}
+
// MappingOptions contains the options for applying mappings
type MappingOptions struct {
FoundryA string