Blame - pkg/parser/grammar_parser.go - KorAP/Koral-Mapper

blob: 3124b828c098e3638016cd0865a6307d67390dd4 [file] [log] [blame]

Akron	22322ec	2025-05-21 11:17:30 +0200	[diff] [blame]	1	package parser
				2
				3	import (
				4	"fmt"
				5	"strings"
				6
				7	"github.com/KorAP/KoralPipe-TermMapper2/pkg/ast"
				8	"github.com/alecthomas/participle/v2"
				9	"github.com/alecthomas/participle/v2/lexer"
				10	)
				11
				12	// GrammarParser parses a simple grammar into AST nodes
				13	type GrammarParser struct {
				14	defaultFoundry string
				15	defaultLayer string
				16	parser *participle.Parser[Grammar]
				17	}
				18
				19	// Grammar represents the root of our grammar
				20	type Grammar struct {
Akron	b40f5ac	2025-05-21 11:22:33 +0200	[diff] [blame^]	21	Token *TokenExpr `parser:"@@"`
Akron	22322ec	2025-05-21 11:17:30 +0200	[diff] [blame]	22	}
				23
				24	// TokenExpr represents a token expression in square brackets
				25	type TokenExpr struct {
Akron	b40f5ac	2025-05-21 11:22:33 +0200	[diff] [blame^]	26	Expr *Expr `parser:"'[' @@ ']'"`
Akron	22322ec	2025-05-21 11:17:30 +0200	[diff] [blame]	27	}
				28
				29	// Expr represents a sequence of terms and operators
				30	type Expr struct {
Akron	b40f5ac	2025-05-21 11:22:33 +0200	[diff] [blame^]	31	First *Term `parser:"@@"`
				32	Rest []Op `parser:"@@"`
Akron	22322ec	2025-05-21 11:17:30 +0200	[diff] [blame]	33	}
				34
				35	type Op struct {
Akron	b40f5ac	2025-05-21 11:22:33 +0200	[diff] [blame^]	36	Operator string `parser:"@('&' \| '\|')"`
				37	Term *Term `parser:"@@"`
Akron	22322ec	2025-05-21 11:17:30 +0200	[diff] [blame]	38	}
				39
				40	// Term represents either a simple term or a parenthesized expression
				41	type Term struct {
Akron	b40f5ac	2025-05-21 11:22:33 +0200	[diff] [blame^]	42	Simple *SimpleTerm `parser:"@@"`
				43	Paren *ParenExpr `parser:"\| @@"`
Akron	22322ec	2025-05-21 11:17:30 +0200	[diff] [blame]	44	}
				45
				46	type ParenExpr struct {
Akron	b40f5ac	2025-05-21 11:22:33 +0200	[diff] [blame^]	47	Expr *Expr `parser:"'(' @@ ')'"`
Akron	22322ec	2025-05-21 11:17:30 +0200	[diff] [blame]	48	}
				49
				50	// SimpleTerm represents any valid term form
				51	type SimpleTerm struct {
Akron	b40f5ac	2025-05-21 11:22:33 +0200	[diff] [blame^]	52	WithFoundryLayer *FoundryLayerTerm `parser:"@@"`
				53	WithFoundryKey *FoundryKeyTerm `parser:"\| @@"`
				54	WithLayer *LayerTerm `parser:"\| @@"`
				55	SimpleKey *KeyTerm `parser:"\| @@"`
Akron	22322ec	2025-05-21 11:17:30 +0200	[diff] [blame]	56	}
				57
				58	// FoundryLayerTerm represents foundry/layer=key:value
				59	type FoundryLayerTerm struct {
Akron	b40f5ac	2025-05-21 11:22:33 +0200	[diff] [blame^]	60	Foundry string `parser:"@Ident '/'"`
				61	Layer string `parser:"@Ident '='"`
				62	Key string `parser:"@Ident"`
				63	Value string `parser:"(':' @Ident)?"`
Akron	22322ec	2025-05-21 11:17:30 +0200	[diff] [blame]	64	}
				65
				66	// FoundryKeyTerm represents foundry/key
				67	type FoundryKeyTerm struct {
Akron	b40f5ac	2025-05-21 11:22:33 +0200	[diff] [blame^]	68	Foundry string `parser:"@Ident '/'"`
				69	Key string `parser:"@Ident"`
Akron	22322ec	2025-05-21 11:17:30 +0200	[diff] [blame]	70	}
				71
				72	// LayerTerm represents layer=key:value
				73	type LayerTerm struct {
Akron	b40f5ac	2025-05-21 11:22:33 +0200	[diff] [blame^]	74	Layer string `parser:"@Ident '='"`
				75	Key string `parser:"@Ident"`
				76	Value string `parser:"(':' @Ident)?"`
Akron	22322ec	2025-05-21 11:17:30 +0200	[diff] [blame]	77	}
				78
				79	// KeyTerm represents key:value
				80	type KeyTerm struct {
Akron	b40f5ac	2025-05-21 11:22:33 +0200	[diff] [blame^]	81	Key string `parser:"@Ident"`
				82	Value string `parser:"(':' @Ident)?"`
Akron	22322ec	2025-05-21 11:17:30 +0200	[diff] [blame]	83	}
				84
				85	// NewGrammarParser creates a new grammar parser with optional default foundry and layer
				86	func NewGrammarParser(defaultFoundry, defaultLayer string) (*GrammarParser, error) {
				87	lex := lexer.MustSimple([]lexer.SimpleRule{
				88	{Name: "Ident", Pattern: `[a-zA-Z][a-zA-Z0-9_]*`},
				89	{Name: "Punct", Pattern: `[\[\]()&\\|=:/]`},
				90	{Name: "Whitespace", Pattern: `\s+`},
				91	})
				92
				93	parser, err := participle.Build[Grammar](
				94	participle.Lexer(lex),
				95	participle.UseLookahead(2),
				96	participle.Elide("Whitespace"),
				97	)
				98	if err != nil {
				99	return nil, fmt.Errorf("failed to build parser: %w", err)
				100	}
				101
				102	return &GrammarParser{
				103	defaultFoundry: defaultFoundry,
				104	defaultLayer: defaultLayer,
				105	parser: parser,
				106	}, nil
				107	}
				108
				109	// Parse parses a grammar string into an AST node
				110	func (p *GrammarParser) Parse(input string) (ast.Node, error) {
				111	// Remove extra spaces around operators to help the parser
				112	input = strings.ReplaceAll(input, " & ", "&")
				113	input = strings.ReplaceAll(input, " \| ", "\|")
				114
				115	// Add spaces around parentheses to help the parser
				116	input = strings.ReplaceAll(input, "(", " ( ")
				117	input = strings.ReplaceAll(input, ")", " ) ")
				118
				119	// Remove any extra spaces
				120	input = strings.TrimSpace(input)
				121
				122	grammar, err := p.parser.ParseString("", input)
				123	if err != nil {
				124	return nil, fmt.Errorf("failed to parse grammar: %w", err)
				125	}
				126
				127	wrap, err := p.parseExpr(grammar.Token.Expr)
				128	if err != nil {
				129	return nil, err
				130	}
				131	return &ast.Token{Wrap: wrap}, nil
				132	}
				133
				134	// parseExpr builds the AST from the parsed Expr
				135	func (p GrammarParser) parseExpr(expr Expr) (ast.Node, error) {
				136	var operands []ast.Node
				137	var operators []string
				138
				139	// Parse the first term
				140	first, err := p.parseTerm(expr.First)
				141	if err != nil {
				142	return nil, err
				143	}
				144	operands = append(operands, first)
				145
				146	// Parse the rest
				147	for _, op := range expr.Rest {
				148	node, err := p.parseTerm(op.Term)
				149	if err != nil {
				150	return nil, err
				151	}
				152	operands = append(operands, node)
				153	operators = append(operators, op.Operator)
				154	}
				155
				156	// If only one operand, return it
				157	if len(operands) == 1 {
				158	return operands[0], nil
				159	}
				160
				161	// Group operands by operator precedence (left-to-right, no precedence between & and \|)
				162	// We'll group by runs of the same operator
				163	var groupOperands []ast.Node
				164	var currentOp string
				165	var currentGroup []ast.Node
				166	for i, op := range operators {
				167	if i == 0 {
				168	currentOp = op
				169	currentGroup = append(currentGroup, operands[i])
				170	}
				171	if op == currentOp {
				172	currentGroup = append(currentGroup, operands[i+1])
				173	} else {
				174	groupOperands = append(groupOperands, &ast.TermGroup{
				175	Operands: append([]ast.Node{}, currentGroup...),
				176	Relation: toRelation(currentOp),
				177	})
				178	currentOp = op
				179	currentGroup = []ast.Node{operands[i+1]}
				180	}
				181	}
				182	if len(currentGroup) > 0 {
				183	groupOperands = append(groupOperands, &ast.TermGroup{
				184	Operands: append([]ast.Node{}, currentGroup...),
				185	Relation: toRelation(currentOp),
				186	})
				187	}
				188	if len(groupOperands) == 1 {
				189	return groupOperands[0], nil
				190	}
				191	// If mixed operators, nest them left-to-right
				192	result := groupOperands[0]
				193	for i := 1; i < len(groupOperands); i++ {
				194	result = &ast.TermGroup{
				195	Operands: []ast.Node{result, groupOperands[i]},
				196	Relation: toRelation(operators[0]),
				197	}
				198	}
				199	return result, nil
				200	}
				201
				202	// parseTerm converts a Term into an AST node
				203	func (p GrammarParser) parseTerm(term Term) (ast.Node, error) {
				204	if term.Simple != nil {
				205	return p.parseSimpleTerm(term.Simple)
				206	}
				207	if term.Paren != nil {
				208	return p.parseExpr(term.Paren.Expr)
				209	}
				210	return nil, fmt.Errorf("invalid term: neither simple nor parenthesized")
				211	}
				212
				213	func toRelation(op string) ast.RelationType {
				214	if op == "\|" {
				215	return ast.OrRelation
				216	}
				217	return ast.AndRelation
				218	}
				219
				220	// parseSimpleTerm converts a SimpleTerm into an AST Term node
				221	func (p GrammarParser) parseSimpleTerm(term SimpleTerm) (ast.Node, error) {
				222	var foundry, layer, key, value string
				223
				224	switch {
				225	case term.WithFoundryLayer != nil:
				226	foundry = term.WithFoundryLayer.Foundry
				227	layer = term.WithFoundryLayer.Layer
				228	key = term.WithFoundryLayer.Key
				229	value = term.WithFoundryLayer.Value
				230	case term.WithFoundryKey != nil:
				231	foundry = term.WithFoundryKey.Foundry
				232	key = term.WithFoundryKey.Key
				233	case term.WithLayer != nil:
				234	layer = term.WithLayer.Layer
				235	key = term.WithLayer.Key
				236	value = term.WithLayer.Value
				237	case term.SimpleKey != nil:
				238	key = term.SimpleKey.Key
				239	value = term.SimpleKey.Value
				240	default:
				241	return nil, fmt.Errorf("invalid term: no valid form found")
				242	}
				243
				244	if foundry == "" {
				245	foundry = p.defaultFoundry
				246	}
				247	if layer == "" {
				248	layer = p.defaultLayer
				249	}
				250
				251	return &ast.Term{
				252	Foundry: foundry,
				253	Key: key,
				254	Layer: layer,
				255	Match: ast.MatchEqual,
				256	Value: value,
				257	}, nil
				258	}