Add parser for mapping grammer
diff --git a/pkg/parser/grammar_parser.go b/pkg/parser/grammar_parser.go
new file mode 100644
index 0000000..18cd5c7
--- /dev/null
+++ b/pkg/parser/grammar_parser.go
@@ -0,0 +1,270 @@
+package parser
+
+import (
+ "fmt"
+ "strings"
+
+ "github.com/KorAP/KoralPipe-TermMapper2/pkg/ast"
+ "github.com/alecthomas/participle/v2"
+ "github.com/alecthomas/participle/v2/lexer"
+)
+
+// GrammarParser parses a simple grammar into AST nodes
+type GrammarParser struct {
+ defaultFoundry string
+ defaultLayer string
+ parser *participle.Parser[Grammar]
+}
+
+// Grammar represents the root of our grammar
+type Grammar struct {
+ Token *TokenExpr `@@`
+}
+
+// TokenExpr represents a token expression in square brackets
+type TokenExpr struct {
+ Expr *Expr `"[" @@ "]"`
+}
+
+// Expr represents a sequence of terms and operators
+type Expr struct {
+ First *Term `@@`
+ Rest []*Op `@@*`
+}
+
+type Op struct {
+ Operator string `@("&" | "|")`
+ Term *Term `@@`
+}
+
+// Term represents either a simple term or a parenthesized expression
+type Term struct {
+ Simple *SimpleTerm `@@`
+ Paren *ParenExpr `| @@`
+}
+
+type ParenExpr struct {
+ Expr *Expr `"(" @@ ")"`
+}
+
+// SimpleTerm represents any valid term form
+type SimpleTerm struct {
+ WithFoundryLayer *FoundryLayerTerm `@@`
+ WithFoundryKey *FoundryKeyTerm `| @@`
+ WithLayer *LayerTerm `| @@`
+ SimpleKey *KeyTerm `| @@`
+}
+
+// FoundryLayerTerm represents foundry/layer=key:value
+type FoundryLayerTerm struct {
+ Foundry string `@Ident "/"`
+ Layer string `@Ident "="`
+ Key string `@Ident`
+ Value string `(":" @Ident)?`
+}
+
+// FoundryKeyTerm represents foundry/key
+type FoundryKeyTerm struct {
+ Foundry string `@Ident "/"`
+ Key string `@Ident`
+}
+
+// LayerTerm represents layer=key:value
+type LayerTerm struct {
+ Layer string `@Ident "="`
+ Key string `@Ident`
+ Value string `(":" @Ident)?`
+}
+
+// KeyTerm represents key:value
+type KeyTerm struct {
+ Key string `@Ident`
+ Value string `(":" @Ident)?`
+}
+
+// NewGrammarParser creates a new grammar parser with optional default foundry and layer
+func NewGrammarParser(defaultFoundry, defaultLayer string) (*GrammarParser, error) {
+ lex := lexer.MustSimple([]lexer.SimpleRule{
+ {Name: "Ident", Pattern: `[a-zA-Z][a-zA-Z0-9_]*`},
+ {Name: "Punct", Pattern: `[\[\]()&\|=:/]`},
+ {Name: "Whitespace", Pattern: `\s+`},
+ })
+
+ parser, err := participle.Build[Grammar](
+ participle.Lexer(lex),
+ participle.UseLookahead(2),
+ participle.Elide("Whitespace"),
+ )
+ if err != nil {
+ return nil, fmt.Errorf("failed to build parser: %w", err)
+ }
+
+ return &GrammarParser{
+ defaultFoundry: defaultFoundry,
+ defaultLayer: defaultLayer,
+ parser: parser,
+ }, nil
+}
+
+// Parse parses a grammar string into an AST node
+func (p *GrammarParser) Parse(input string) (ast.Node, error) {
+ // Remove extra spaces around operators to help the parser
+ input = strings.ReplaceAll(input, " & ", "&")
+ input = strings.ReplaceAll(input, " | ", "|")
+
+ // Add spaces around parentheses to help the parser
+ input = strings.ReplaceAll(input, "(", " ( ")
+ input = strings.ReplaceAll(input, ")", " ) ")
+
+ // Remove any extra spaces
+ input = strings.TrimSpace(input)
+
+ grammar, err := p.parser.ParseString("", input)
+ if err != nil {
+ return nil, fmt.Errorf("failed to parse grammar: %w", err)
+ }
+
+ wrap, err := p.parseExpr(grammar.Token.Expr)
+ if err != nil {
+ return nil, err
+ }
+ return &ast.Token{Wrap: wrap}, nil
+}
+
+// parseExpr builds the AST from the parsed Expr
+func (p *GrammarParser) parseExpr(expr *Expr) (ast.Node, error) {
+ var operands []ast.Node
+ var operators []string
+
+ // Parse the first term
+ first, err := p.parseTerm(expr.First)
+ if err != nil {
+ return nil, err
+ }
+ operands = append(operands, first)
+
+ // Parse the rest
+ for _, op := range expr.Rest {
+ node, err := p.parseTerm(op.Term)
+ if err != nil {
+ return nil, err
+ }
+ operands = append(operands, node)
+ operators = append(operators, op.Operator)
+ }
+
+ // If only one operand, return it
+ if len(operands) == 1 {
+ return operands[0], nil
+ }
+
+ // Group operands by operator precedence (left-to-right, no precedence between & and |)
+ // We'll group by runs of the same operator
+ var groupOperands []ast.Node
+ var currentOp string
+ var currentGroup []ast.Node
+ for i, op := range operators {
+ if i == 0 {
+ currentOp = op
+ currentGroup = append(currentGroup, operands[i])
+ }
+ if op == currentOp {
+ currentGroup = append(currentGroup, operands[i+1])
+ } else {
+ groupOperands = append(groupOperands, &ast.TermGroup{
+ Operands: append([]ast.Node{}, currentGroup...),
+ Relation: toRelation(currentOp),
+ })
+ currentOp = op
+ currentGroup = []ast.Node{operands[i+1]}
+ }
+ }
+ if len(currentGroup) > 0 {
+ groupOperands = append(groupOperands, &ast.TermGroup{
+ Operands: append([]ast.Node{}, currentGroup...),
+ Relation: toRelation(currentOp),
+ })
+ }
+ if len(groupOperands) == 1 {
+ return groupOperands[0], nil
+ }
+ // If mixed operators, nest them left-to-right
+ result := groupOperands[0]
+ for i := 1; i < len(groupOperands); i++ {
+ result = &ast.TermGroup{
+ Operands: []ast.Node{result, groupOperands[i]},
+ Relation: toRelation(operators[0]),
+ }
+ }
+ return result, nil
+}
+
+// parseTerm converts a Term into an AST node
+func (p *GrammarParser) parseTerm(term *Term) (ast.Node, error) {
+ if term.Simple != nil {
+ return p.parseSimpleTerm(term.Simple)
+ }
+ if term.Paren != nil {
+ return p.parseExpr(term.Paren.Expr)
+ }
+ return nil, fmt.Errorf("invalid term: neither simple nor parenthesized")
+}
+
+func toRelation(op string) ast.RelationType {
+ if op == "|" {
+ return ast.OrRelation
+ }
+ return ast.AndRelation
+}
+
+// parseSimpleTerm converts a SimpleTerm into an AST Term node
+func (p *GrammarParser) parseSimpleTerm(term *SimpleTerm) (ast.Node, error) {
+ var foundry, layer, key, value string
+
+ switch {
+ case term.WithFoundryLayer != nil:
+ foundry = term.WithFoundryLayer.Foundry
+ layer = term.WithFoundryLayer.Layer
+ key = term.WithFoundryLayer.Key
+ value = term.WithFoundryLayer.Value
+ case term.WithFoundryKey != nil:
+ foundry = term.WithFoundryKey.Foundry
+ key = term.WithFoundryKey.Key
+ case term.WithLayer != nil:
+ layer = term.WithLayer.Layer
+ key = term.WithLayer.Key
+ value = term.WithLayer.Value
+ case term.SimpleKey != nil:
+ key = term.SimpleKey.Key
+ value = term.SimpleKey.Value
+ default:
+ return nil, fmt.Errorf("invalid term: no valid form found")
+ }
+
+ if foundry == "" {
+ foundry = p.defaultFoundry
+ }
+ if layer == "" {
+ layer = p.defaultLayer
+ }
+
+ return &ast.Term{
+ Foundry: foundry,
+ Key: key,
+ Layer: layer,
+ Match: ast.MatchEqual,
+ Value: value,
+ }, nil
+}
+
+// indexRune returns the index of the first instance of any of the runes in chars in s, or -1 if no rune is present.
+func indexRune(s string, chars ...rune) int {
+ for i, c := range s {
+ for _, ch := range chars {
+ if c == ch {
+ return i
+ }
+ }
+ }
+ return -1
+}
diff --git a/pkg/parser/grammar_parser_test.go b/pkg/parser/grammar_parser_test.go
new file mode 100644
index 0000000..62d16c0
--- /dev/null
+++ b/pkg/parser/grammar_parser_test.go
@@ -0,0 +1,218 @@
+package parser
+
+import (
+ "testing"
+
+ "github.com/KorAP/KoralPipe-TermMapper2/pkg/ast"
+ "github.com/stretchr/testify/assert"
+ "github.com/stretchr/testify/require"
+)
+
+func TestGrammarParserSimpleTerm(t *testing.T) {
+ tests := []struct {
+ name string
+ input string
+ defaultFoundry string
+ defaultLayer string
+ expected *SimpleTerm
+ expectError bool
+ }{
+ {
+ name: "Foundry layer key value",
+ input: "[opennlp/p=PIDAT:new]",
+ defaultFoundry: "opennlp",
+ defaultLayer: "p",
+ expected: &SimpleTerm{
+ WithFoundryLayer: &FoundryLayerTerm{
+ Foundry: "opennlp",
+ Layer: "p",
+ Key: "PIDAT",
+ Value: "new",
+ },
+ },
+ },
+ {
+ name: "Foundry layer key",
+ input: "[opennlp/p=PIDAT]",
+ defaultFoundry: "opennlp",
+ defaultLayer: "p",
+ expected: &SimpleTerm{
+ WithFoundryLayer: &FoundryLayerTerm{
+ Foundry: "opennlp",
+ Layer: "p",
+ Key: "PIDAT",
+ },
+ },
+ },
+ {
+ name: "Layer key",
+ input: "[p=PIDAT]",
+ defaultFoundry: "opennlp",
+ defaultLayer: "p",
+ expected: &SimpleTerm{
+ WithLayer: &LayerTerm{
+ Layer: "p",
+ Key: "PIDAT",
+ },
+ },
+ },
+ {
+ name: "Simple key",
+ input: "[PIDAT]",
+ defaultFoundry: "opennlp",
+ defaultLayer: "p",
+ expected: &SimpleTerm{
+ SimpleKey: &KeyTerm{
+ Key: "PIDAT",
+ },
+ },
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ parser, err := NewGrammarParser(tt.defaultFoundry, tt.defaultLayer)
+ require.NoError(t, err)
+
+ grammar, err := parser.parser.ParseString("", tt.input)
+ if tt.expectError {
+ assert.Error(t, err)
+ return
+ }
+ require.NoError(t, err)
+ assert.Equal(t, tt.expected, grammar.Token.Expr.First.Simple)
+ })
+ }
+}
+
+func TestGrammarParser(t *testing.T) {
+ tests := []struct {
+ name string
+ input string
+ defaultFoundry string
+ defaultLayer string
+ expected ast.Node
+ expectError bool
+ }{
+ {
+ name: "Simple term with foundry and layer",
+ input: "[opennlp/p=PIDAT]",
+ defaultFoundry: "opennlp",
+ defaultLayer: "p",
+ expected: &ast.Token{
+ Wrap: &ast.Term{
+ Foundry: "opennlp",
+ Key: "PIDAT",
+ Layer: "p",
+ Match: ast.MatchEqual,
+ },
+ },
+ },
+ {
+ name: "Term group with and relation",
+ input: "[opennlp/p=PIDAT & opennlp/p=AdjType:Pdt]",
+ defaultFoundry: "opennlp",
+ defaultLayer: "p",
+ expected: &ast.Token{
+ Wrap: &ast.TermGroup{
+ Operands: []ast.Node{
+ &ast.Term{
+ Foundry: "opennlp",
+ Key: "PIDAT",
+ Layer: "p",
+ Match: ast.MatchEqual,
+ },
+ &ast.Term{
+ Foundry: "opennlp",
+ Key: "AdjType",
+ Layer: "p",
+ Match: ast.MatchEqual,
+ Value: "Pdt",
+ },
+ },
+ Relation: ast.AndRelation,
+ },
+ },
+ },
+ {
+ name: "Term group with or relation",
+ input: "[opennlp/p=PronType:Ind | opennlp/p=PronType:Neg]",
+ defaultFoundry: "opennlp",
+ defaultLayer: "p",
+ expected: &ast.Token{
+ Wrap: &ast.TermGroup{
+ Operands: []ast.Node{
+ &ast.Term{
+ Foundry: "opennlp",
+ Key: "PronType",
+ Layer: "p",
+ Match: ast.MatchEqual,
+ Value: "Ind",
+ },
+ &ast.Term{
+ Foundry: "opennlp",
+ Key: "PronType",
+ Layer: "p",
+ Match: ast.MatchEqual,
+ Value: "Neg",
+ },
+ },
+ Relation: ast.OrRelation,
+ },
+ },
+ },
+ {
+ name: "Complex term group",
+ input: "[opennlp/p=PIDAT & (opennlp/p=PronType:Ind | opennlp/p=PronType:Neg)]",
+ defaultFoundry: "opennlp",
+ defaultLayer: "p",
+ expected: &ast.Token{
+ Wrap: &ast.TermGroup{
+ Operands: []ast.Node{
+ &ast.Term{
+ Foundry: "opennlp",
+ Key: "PIDAT",
+ Layer: "p",
+ Match: ast.MatchEqual,
+ },
+ &ast.TermGroup{
+ Operands: []ast.Node{
+ &ast.Term{
+ Foundry: "opennlp",
+ Key: "PronType",
+ Layer: "p",
+ Match: ast.MatchEqual,
+ Value: "Ind",
+ },
+ &ast.Term{
+ Foundry: "opennlp",
+ Key: "PronType",
+ Layer: "p",
+ Match: ast.MatchEqual,
+ Value: "Neg",
+ },
+ },
+ Relation: ast.OrRelation,
+ },
+ },
+ Relation: ast.AndRelation,
+ },
+ },
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ parser, err := NewGrammarParser(tt.defaultFoundry, tt.defaultLayer)
+ require.NoError(t, err)
+
+ result, err := parser.Parse(tt.input)
+ if tt.expectError {
+ assert.Error(t, err)
+ return
+ }
+ require.NoError(t, err)
+ assert.Equal(t, tt.expected, result)
+ })
+ }
+}