Support wildcard layers (foundry/*=key) and punctuation in keys
Change-Id: I06db6e822ca60b355cc3e86b504dba1cf3c92845
diff --git a/.gitignore b/.gitignore
index f85a248..0f23a78 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,4 +6,6 @@
examples/
todo.txt
/vendor
-/termmapper
\ No newline at end of file
+/termmapper
+*.yaml
+\#*
diff --git a/config/config_test.go b/config/config_test.go
index 3df7bfa..9dc64ca 100644
--- a/config/config_test.go
+++ b/config/config_test.go
@@ -362,3 +362,215 @@
})
}
}
+
+func TestUserProvidedMappingRules(t *testing.T) {
+ // Test a YAML mapping list whose source keys contain punctuation ("$(", "$," and "$.")
+ content := `
+- id: stts-ud
+ foundryA: opennlp
+ layerA: p
+ foundryB: upos
+ layerB: p
+ mappings:
+ - "[$\\(] <> [PUNCT & PunctType=Brck]"
+ - "[$,] <> [PUNCT & PunctType=Comm]"
+ - "[$.] <> [PUNCT & PunctType=Peri]"
+ - "[ADJA] <> [ADJ]"
+ - "[ADJD] <> [ADJ & Variant=Short]"
+ - "[ADV] <> [ADV]"
+`
+ tmpfile, err := os.CreateTemp("", "user-config-*.yaml")
+ require.NoError(t, err)
+ defer os.Remove(tmpfile.Name())
+
+ _, err = tmpfile.WriteString(content)
+ require.NoError(t, err)
+ err = tmpfile.Close()
+ require.NoError(t, err)
+
+ // Test loading the configuration
+ config, err := LoadConfig(tmpfile.Name())
+ require.NoError(t, err)
+
+ // Verify the configuration loaded correctly
+ require.Len(t, config.Lists, 1)
+ list := config.Lists[0]
+ assert.Equal(t, "stts-ud", list.ID)
+ assert.Equal(t, "opennlp", list.FoundryA)
+ assert.Equal(t, "p", list.LayerA)
+ assert.Equal(t, "upos", list.FoundryB)
+ assert.Equal(t, "p", list.LayerB)
+ require.Len(t, list.Mappings, 6)
+
+ // First, parse each punctuation mapping on its own so a failure points at a single rule
+ t.Run("parenthesis mapping", func(t *testing.T) {
+ singleRule := &MappingList{
+ ID: "test-paren",
+ FoundryA: "opennlp",
+ LayerA: "p",
+ FoundryB: "upos",
+ LayerB: "p",
+ Mappings: []MappingRule{"[$\\(] <> [PUNCT & PunctType=Brck]"},
+ }
+ results, err := singleRule.ParseMappings()
+ require.NoError(t, err)
+ require.Len(t, results, 1)
+
+ upperTerm := results[0].Upper.Wrap.(*ast.Term)
+ assert.Equal(t, "$(", upperTerm.Key)
+ })
+
+ t.Run("comma mapping", func(t *testing.T) {
+ singleRule := &MappingList{
+ ID: "test-comma",
+ FoundryA: "opennlp",
+ LayerA: "p",
+ FoundryB: "upos",
+ LayerB: "p",
+ Mappings: []MappingRule{"[$,] <> [PUNCT & PunctType=Comm]"},
+ }
+ results, err := singleRule.ParseMappings()
+ require.NoError(t, err)
+ require.Len(t, results, 1)
+
+ upperTerm := results[0].Upper.Wrap.(*ast.Term)
+ assert.Equal(t, "$,", upperTerm.Key)
+ })
+
+ t.Run("period mapping", func(t *testing.T) {
+ singleRule := &MappingList{
+ ID: "test-period",
+ FoundryA: "opennlp",
+ LayerA: "p",
+ FoundryB: "upos",
+ LayerB: "p",
+ Mappings: []MappingRule{"[$.] <> [PUNCT & PunctType=Peri]"},
+ }
+ results, err := singleRule.ParseMappings()
+ require.NoError(t, err)
+ require.Len(t, results, 1)
+
+ upperTerm := results[0].Upper.Wrap.(*ast.Term)
+ assert.Equal(t, "$.", upperTerm.Key)
+ })
+
+ // Test that all mapping rules can be parsed successfully
+ results, err := list.ParseMappings()
+ require.NoError(t, err)
+ require.Len(t, results, 6)
+
+ // Verify specific parsing of the special character mapping
+ // The first mapping "[$\\(] <> [PUNCT & PunctType=Brck]" should parse correctly
+ firstMapping := results[0]
+ require.NotNil(t, firstMapping.Upper)
+ upperTerm := firstMapping.Upper.Wrap.(*ast.Term)
+ assert.Equal(t, "$(", upperTerm.Key) // The actual parsed key should be "$("
+ assert.Equal(t, "opennlp", upperTerm.Foundry)
+ assert.Equal(t, "p", upperTerm.Layer)
+
+ require.NotNil(t, firstMapping.Lower)
+ lowerGroup := firstMapping.Lower.Wrap.(*ast.TermGroup)
+ require.Len(t, lowerGroup.Operands, 2)
+ assert.Equal(t, ast.AndRelation, lowerGroup.Relation)
+
+ // Check the PUNCT term
+ punctTerm := lowerGroup.Operands[0].(*ast.Term)
+ assert.Equal(t, "PUNCT", punctTerm.Key)
+ assert.Equal(t, "upos", punctTerm.Foundry)
+ assert.Equal(t, "p", punctTerm.Layer)
+
+ // Check the PunctType term
+ punctTypeTerm := lowerGroup.Operands[1].(*ast.Term)
+ assert.Equal(t, "PunctType", punctTypeTerm.Layer)
+ assert.Equal(t, "Brck", punctTypeTerm.Key)
+ assert.Equal(t, "upos", punctTypeTerm.Foundry)
+
+ // Verify the comma mapping as well
+ secondMapping := results[1]
+ upperTerm2 := secondMapping.Upper.Wrap.(*ast.Term)
+ assert.Equal(t, "$,", upperTerm2.Key)
+
+ // Verify the period mapping
+ thirdMapping := results[2]
+ upperTerm3 := thirdMapping.Upper.Wrap.(*ast.Term)
+ assert.Equal(t, "$.", upperTerm3.Key)
+
+ // Verify basic mappings without special characters
+ fourthMapping := results[3]
+ upperTerm4 := fourthMapping.Upper.Wrap.(*ast.Term)
+ assert.Equal(t, "ADJA", upperTerm4.Key)
+ lowerTerm4 := fourthMapping.Lower.Wrap.(*ast.Term)
+ assert.Equal(t, "ADJ", lowerTerm4.Key)
+}
+
+func TestExistingUposYaml(t *testing.T) {
+ // Test that the existing upos.yaml file can be parsed correctly
+ config, err := LoadConfig("../upos.yaml")
+ require.NoError(t, err)
+
+ // Verify the configuration loaded correctly
+ require.Len(t, config.Lists, 1)
+ list := config.Lists[0]
+ assert.Equal(t, "stts-ud", list.ID)
+ assert.Equal(t, "opennlp", list.FoundryA)
+ assert.Equal(t, "p", list.LayerA)
+ assert.Equal(t, "upos", list.FoundryB)
+ assert.Equal(t, "p", list.LayerB)
+ require.Len(t, list.Mappings, 54) // Should have 54 mapping rules
+
+ // Test that all mapping rules can be parsed successfully
+ results, err := list.ParseMappings()
+ require.NoError(t, err)
+ require.Len(t, results, 54)
+
+ // Test a few specific mappings to ensure they parse correctly
+
+ // Test the special character mappings
+ firstMapping := results[0] // "[$\\(] <> [PUNCT & PunctType=Brck]"
+ upperTerm := firstMapping.Upper.Wrap.(*ast.Term)
+ assert.Equal(t, "$(", upperTerm.Key)
+ assert.Equal(t, "opennlp", upperTerm.Foundry)
+ assert.Equal(t, "p", upperTerm.Layer)
+
+ lowerGroup := firstMapping.Lower.Wrap.(*ast.TermGroup)
+ require.Len(t, lowerGroup.Operands, 2)
+ assert.Equal(t, ast.AndRelation, lowerGroup.Relation)
+
+ punctTerm := lowerGroup.Operands[0].(*ast.Term)
+ assert.Equal(t, "PUNCT", punctTerm.Key)
+ assert.Equal(t, "upos", punctTerm.Foundry)
+ assert.Equal(t, "p", punctTerm.Layer)
+
+ punctTypeTerm := lowerGroup.Operands[1].(*ast.Term)
+ assert.Equal(t, "PunctType", punctTypeTerm.Layer)
+ assert.Equal(t, "Brck", punctTypeTerm.Key)
+ assert.Equal(t, "upos", punctTypeTerm.Foundry)
+
+ // Test a complex mapping with multiple attributes
+ // "[PIDAT] <> [DET & AdjType=Pdt & (PronType=Ind | PronType=Neg | PronType=Tot)]"
+ pidatMapping := results[24] // This should be the PIDAT mapping
+ pidatUpper := pidatMapping.Upper.Wrap.(*ast.Term)
+ assert.Equal(t, "PIDAT", pidatUpper.Key)
+
+ pidatLower := pidatMapping.Lower.Wrap.(*ast.TermGroup)
+ assert.Equal(t, ast.AndRelation, pidatLower.Relation)
+ require.Len(t, pidatLower.Operands, 3) // DET, AdjType=Pdt, and the parenthesized group
+
+ detTerm := pidatLower.Operands[0].(*ast.Term)
+ assert.Equal(t, "DET", detTerm.Key)
+
+ adjTypeTerm := pidatLower.Operands[1].(*ast.Term)
+ assert.Equal(t, "AdjType", adjTypeTerm.Layer)
+ assert.Equal(t, "Pdt", adjTypeTerm.Key)
+
+ // The third operand should be a nested TermGroup with OR relation
+ nestedGroup := pidatLower.Operands[2].(*ast.TermGroup)
+ assert.Equal(t, ast.OrRelation, nestedGroup.Relation)
+ require.Len(t, nestedGroup.Operands, 3) // PronType=Ind, PronType=Neg, PronType=Tot
+
+ for i, expectedValue := range []string{"Ind", "Neg", "Tot"} {
+ pronTypeTerm := nestedGroup.Operands[i].(*ast.Term)
+ assert.Equal(t, "PronType", pronTypeTerm.Layer)
+ assert.Equal(t, expectedValue, pronTypeTerm.Key)
+ }
+}
diff --git a/parser/grammar_parser.go b/parser/grammar_parser.go
index 81ebe62..8ee167f 100644
--- a/parser/grammar_parser.go
+++ b/parser/grammar_parser.go
@@ -61,10 +61,11 @@
// SimpleTerm represents any valid term form
type SimpleTerm struct {
- WithFoundryLayer *FoundryLayerTerm `parser:"@@"`
- WithFoundryKey *FoundryKeyTerm `parser:"| @@"`
- WithLayer *LayerTerm `parser:"| @@"`
- SimpleKey *KeyTerm `parser:"| @@"`
+ WithFoundryLayer *FoundryLayerTerm `parser:"@@"`
+ WithFoundryWildcard *FoundryWildcardTerm `parser:"| @@"`
+ WithFoundryKey *FoundryKeyTerm `parser:"| @@"`
+ WithLayer *LayerTerm `parser:"| @@"`
+ SimpleKey *KeyTerm `parser:"| @@"`
}
// FoundryLayerTerm represents foundry/layer=key:value
@@ -75,30 +76,42 @@
Value string `parser:"(':' @Ident)?"`
}
+// FoundryWildcardTerm represents foundry/*=key (wildcard layer)
+type FoundryWildcardTerm struct {
+ Foundry string `parser:"@Ident '/' '*' '='"`
+ Key string `parser:"@Ident"`
+}
+
// FoundryKeyTerm represents foundry/key
type FoundryKeyTerm struct {
Foundry string `parser:"@Ident '/'"`
Key string `parser:"@Ident"`
}
-// LayerTerm represents layer=key:value
+// LayerTerm represents layer=key:value (only when no foundry is present)
type LayerTerm struct {
Layer string `parser:"@Ident '='"`
Key string `parser:"@Ident"`
Value string `parser:"(':' @Ident)?"`
}
-// KeyTerm represents key:value
+// KeyTerm represents key:value or key=value
type KeyTerm struct {
Key string `parser:"@Ident"`
- Value string `parser:"(':' @Ident)?"`
+ Value string `parser:"((':' | '=') @Ident)?"`
+}
+
+// EscapedPunct represents an escaped punctuation character like \(
+type EscapedPunct struct {
+ Prefix string `parser:"@Ident"`
+ Punct string `parser:"@Punct"`
}
// NewGrammarParser creates a new grammar parser with optional default foundry and layer
func NewGrammarParser(defaultFoundry, defaultLayer string) (*GrammarParser, error) {
lex := lexer.MustSimple([]lexer.SimpleRule{
- {Name: "Ident", Pattern: `(?:[a-zA-Z$]|\\.)(?:[a-zA-Z0-9_$]|\\.)*`},
- {Name: "Punct", Pattern: `[\[\]()&\|=:/]|<>`},
+ {Name: "Ident", Pattern: `(?:[a-zA-Z$,.]|\\.)(?:[a-zA-Z0-9_$,.]|\\.)*`},
+ {Name: "Punct", Pattern: `[\[\]()&\|=:/\*]|<>`},
{Name: "Whitespace", Pattern: `\s+`},
})
@@ -140,8 +153,24 @@
runes := []rune(input)
for i, r := range runes {
if (r == '(' || r == ')') && (i == 0 || runes[i-1] != '\\') {
- // Only add spaces if the parenthesis is not escaped
- result = append(result, ' ', r, ' ')
+ // Only add spaces if the parenthesis is not escaped and not part of an identifier
+ // Check if this parenthesis is inside brackets (part of an identifier)
+ insideBrackets := false
+ bracketDepth := 0
+ for j := 0; j < i; j++ {
+ if runes[j] == '[' {
+ bracketDepth++
+ } else if runes[j] == ']' {
+ bracketDepth--
+ }
+ }
+ insideBrackets = bracketDepth > 0
+
+ if !insideBrackets {
+ result = append(result, ' ', r, ' ')
+ } else {
+ result = append(result, r)
+ }
} else {
result = append(result, r)
}
@@ -180,8 +209,24 @@
runes := []rune(input)
for i, r := range runes {
if (r == '(' || r == ')') && (i == 0 || runes[i-1] != '\\') {
- // Only add spaces if the parenthesis is not escaped
- result = append(result, ' ', r, ' ')
+ // Only add spaces if the parenthesis is not escaped and not part of an identifier
+ // Check if this parenthesis is inside brackets (part of an identifier)
+ insideBrackets := false
+ bracketDepth := 0
+ for j := 0; j < i; j++ {
+ if runes[j] == '[' {
+ bracketDepth++
+ } else if runes[j] == ']' {
+ bracketDepth--
+ }
+ }
+ insideBrackets = bracketDepth > 0
+
+ if !insideBrackets {
+ result = append(result, ' ', r, ' ')
+ } else {
+ result = append(result, r)
+ }
} else {
result = append(result, r)
}
@@ -340,13 +385,34 @@
layer = unescapeString(term.WithFoundryLayer.Layer)
key = unescapeString(term.WithFoundryLayer.Key)
value = unescapeString(term.WithFoundryLayer.Value)
+ case term.WithFoundryWildcard != nil:
+ foundry = unescapeString(term.WithFoundryWildcard.Foundry)
+ key = unescapeString(term.WithFoundryWildcard.Key)
case term.WithFoundryKey != nil:
foundry = unescapeString(term.WithFoundryKey.Foundry)
key = unescapeString(term.WithFoundryKey.Key)
case term.WithLayer != nil:
- layer = unescapeString(term.WithLayer.Layer)
- key = unescapeString(term.WithLayer.Key)
- value = unescapeString(term.WithLayer.Value)
+ // Special case: if LayerTerm was parsed but the layer doesn't match the default layer,
+ // treat it as a key=value pattern instead
+ parsedLayer := unescapeString(term.WithLayer.Layer)
+ parsedKey := unescapeString(term.WithLayer.Key)
+ parsedValue := unescapeString(term.WithLayer.Value)
+
+ if p.defaultLayer != "" && parsedLayer == p.defaultLayer {
+ // This is a genuine layer=key pattern when the layer matches the default
+ layer = parsedLayer
+ key = parsedKey
+ value = parsedValue
+ } else if p.defaultLayer != "" && parsedLayer != p.defaultLayer {
+ // This should be treated as key=value pattern when there's a default layer but it doesn't match
+ key = parsedLayer
+ value = parsedKey
+ } else {
+ // No default layer context, treat as genuine layer=key pattern
+ layer = parsedLayer
+ key = parsedKey
+ value = parsedValue
+ }
case term.SimpleKey != nil:
key = unescapeString(term.SimpleKey.Key)
value = unescapeString(term.SimpleKey.Value)
diff --git a/parser/grammar_parser_test.go b/parser/grammar_parser_test.go
index 189ff68..5c0cc56 100644
--- a/parser/grammar_parser_test.go
+++ b/parser/grammar_parser_test.go
@@ -89,6 +89,18 @@
},
},
},
+ {
+ name: "Foundry wildcard key",
+ input: "[opennlp/*=PIDAT]",
+ defaultFoundry: "opennlp",
+ defaultLayer: "p",
+ expected: &SimpleTerm{
+ WithFoundryWildcard: &FoundryWildcardTerm{
+ Foundry: "opennlp",
+ Key: "PIDAT",
+ },
+ },
+ },
}
for _, tt := range tests {
@@ -228,6 +240,20 @@
},
},
},
+ {
+ name: "Wildcard pattern",
+ input: "[opennlp/*=PIDAT]",
+ defaultFoundry: "opennlp",
+ defaultLayer: "p",
+ expected: &ast.Token{
+ Wrap: &ast.Term{
+ Foundry: "opennlp",
+ Key: "PIDAT",
+ Layer: "p",
+ Match: ast.MatchEqual,
+ },
+ },
+ },
}
for _, tt := range tests {
@@ -363,7 +389,7 @@
assert.Error(t, err)
return
}
- assert.NoError(t, err)
+ assert.NoError(t, err, "Input: %s", tt.input)
assert.Equal(t, tt.expected, result)
})
}