Blame - pkg/parser/grammar_parser.go - KorAP/Koral-Mapper

blob: 6e677b6f3072bbda4135ce1069b9a3e5a0027e0d [file] [log] [blame]

Akron	22322ec	2025-05-21 11:17:30 +0200	[diff] [blame]	1	package parser
				2
				3	import (
				4	"fmt"
				5	"strings"
				6
				7	"github.com/KorAP/KoralPipe-TermMapper2/pkg/ast"
				8	"github.com/alecthomas/participle/v2"
				9	"github.com/alecthomas/participle/v2/lexer"
				10	)
				11
				12	// GrammarParser parses a simple grammar into AST nodes
				13	type GrammarParser struct {
				14	defaultFoundry string
				15	defaultLayer string
Akron	bb5065f	2025-05-21 12:44:05 +0200	[diff] [blame^]	16	tokenParser *participle.Parser[TokenGrammar]
				17	mappingParser *participle.Parser[MappingGrammar]
Akron	22322ec	2025-05-21 11:17:30 +0200	[diff] [blame]	18	}
				19
Akron	bb5065f	2025-05-21 12:44:05 +0200	[diff] [blame^]	20	// TokenGrammar represents a single token expression
				21	type TokenGrammar struct {
				22	Token *TokenExpr `parser:"@@"`
				23	}
				24
				25	// MappingGrammar represents a mapping rule
				26	type MappingGrammar struct {
				27	Mapping *MappingRule `parser:"@@"`
				28	}
				29
/*
// Grammar represents the root of our grammar.
// Disabled; the file now declares the separate entry rules TokenGrammar and
// MappingGrammar instead.
type Grammar struct {
	Token   *TokenExpr   `parser:" @@"`
	Mapping *MappingRule `parser:"| @@"`
}
*/
				36
				37	// MappingRule represents a mapping between two token expressions
				38	type MappingRule struct {
				39	Upper *TokenExpr `parser:"@@"`
				40	Lower *TokenExpr `parser:"'<>' @@"`
Akron	22322ec	2025-05-21 11:17:30 +0200	[diff] [blame]	41	}
				42
				43	// TokenExpr represents a token expression in square brackets
				44	type TokenExpr struct {
Akron	b40f5ac	2025-05-21 11:22:33 +0200	[diff] [blame]	45	Expr *Expr `parser:"'[' @@ ']'"`
Akron	22322ec	2025-05-21 11:17:30 +0200	[diff] [blame]	46	}
				47
				48	// Expr represents a sequence of terms and operators
				49	type Expr struct {
Akron	b40f5ac	2025-05-21 11:22:33 +0200	[diff] [blame]	50	First *Term `parser:"@@"`
				51	Rest []Op `parser:"@@"`
Akron	22322ec	2025-05-21 11:17:30 +0200	[diff] [blame]	52	}
				53
				54	type Op struct {
Akron	b40f5ac	2025-05-21 11:22:33 +0200	[diff] [blame]	55	Operator string `parser:"@('&' \| '\|')"`
				56	Term *Term `parser:"@@"`
Akron	22322ec	2025-05-21 11:17:30 +0200	[diff] [blame]	57	}
				58
				59	// Term represents either a simple term or a parenthesized expression
				60	type Term struct {
Akron	b40f5ac	2025-05-21 11:22:33 +0200	[diff] [blame]	61	Simple *SimpleTerm `parser:"@@"`
				62	Paren *ParenExpr `parser:"\| @@"`
Akron	22322ec	2025-05-21 11:17:30 +0200	[diff] [blame]	63	}
				64
				65	type ParenExpr struct {
Akron	b40f5ac	2025-05-21 11:22:33 +0200	[diff] [blame]	66	Expr *Expr `parser:"'(' @@ ')'"`
Akron	22322ec	2025-05-21 11:17:30 +0200	[diff] [blame]	67	}
				68
				69	// SimpleTerm represents any valid term form
				70	type SimpleTerm struct {
Akron	b40f5ac	2025-05-21 11:22:33 +0200	[diff] [blame]	71	WithFoundryLayer *FoundryLayerTerm `parser:"@@"`
				72	WithFoundryKey *FoundryKeyTerm `parser:"\| @@"`
				73	WithLayer *LayerTerm `parser:"\| @@"`
				74	SimpleKey *KeyTerm `parser:"\| @@"`
Akron	22322ec	2025-05-21 11:17:30 +0200	[diff] [blame]	75	}
				76
				77	// FoundryLayerTerm represents foundry/layer=key:value
				78	type FoundryLayerTerm struct {
Akron	b40f5ac	2025-05-21 11:22:33 +0200	[diff] [blame]	79	Foundry string `parser:"@Ident '/'"`
				80	Layer string `parser:"@Ident '='"`
				81	Key string `parser:"@Ident"`
				82	Value string `parser:"(':' @Ident)?"`
Akron	22322ec	2025-05-21 11:17:30 +0200	[diff] [blame]	83	}
				84
				85	// FoundryKeyTerm represents foundry/key
				86	type FoundryKeyTerm struct {
Akron	b40f5ac	2025-05-21 11:22:33 +0200	[diff] [blame]	87	Foundry string `parser:"@Ident '/'"`
				88	Key string `parser:"@Ident"`
Akron	22322ec	2025-05-21 11:17:30 +0200	[diff] [blame]	89	}
				90
				91	// LayerTerm represents layer=key:value
				92	type LayerTerm struct {
Akron	b40f5ac	2025-05-21 11:22:33 +0200	[diff] [blame]	93	Layer string `parser:"@Ident '='"`
				94	Key string `parser:"@Ident"`
				95	Value string `parser:"(':' @Ident)?"`
Akron	22322ec	2025-05-21 11:17:30 +0200	[diff] [blame]	96	}
				97
				98	// KeyTerm represents key:value
				99	type KeyTerm struct {
Akron	b40f5ac	2025-05-21 11:22:33 +0200	[diff] [blame]	100	Key string `parser:"@Ident"`
				101	Value string `parser:"(':' @Ident)?"`
Akron	22322ec	2025-05-21 11:17:30 +0200	[diff] [blame]	102	}
				103
				104	// NewGrammarParser creates a new grammar parser with optional default foundry and layer
				105	func NewGrammarParser(defaultFoundry, defaultLayer string) (*GrammarParser, error) {
				106	lex := lexer.MustSimple([]lexer.SimpleRule{
				107	{Name: "Ident", Pattern: `[a-zA-Z][a-zA-Z0-9_]*`},
Akron	bb5065f	2025-05-21 12:44:05 +0200	[diff] [blame^]	108	{Name: "Punct", Pattern: `[\[\]()&\\|=:/]\|<>`},
Akron	22322ec	2025-05-21 11:17:30 +0200	[diff] [blame]	109	{Name: "Whitespace", Pattern: `\s+`},
				110	})
				111
Akron	bb5065f	2025-05-21 12:44:05 +0200	[diff] [blame^]	112	tokenParser, err := participle.Build[TokenGrammar](
Akron	22322ec	2025-05-21 11:17:30 +0200	[diff] [blame]	113	participle.Lexer(lex),
				114	participle.UseLookahead(2),
				115	participle.Elide("Whitespace"),
				116	)
				117	if err != nil {
Akron	bb5065f	2025-05-21 12:44:05 +0200	[diff] [blame^]	118	return nil, fmt.Errorf("failed to build token parser: %w", err)
				119	}
				120
				121	mappingParser, err := participle.Build[MappingGrammar](
				122	participle.Lexer(lex),
				123	participle.UseLookahead(2),
				124	participle.Elide("Whitespace"),
				125	)
				126	if err != nil {
				127	return nil, fmt.Errorf("failed to build mapping parser: %w", err)
Akron	22322ec	2025-05-21 11:17:30 +0200	[diff] [blame]	128	}
				129
				130	return &GrammarParser{
				131	defaultFoundry: defaultFoundry,
				132	defaultLayer: defaultLayer,
Akron	bb5065f	2025-05-21 12:44:05 +0200	[diff] [blame^]	133	tokenParser: tokenParser,
				134	mappingParser: mappingParser,
Akron	22322ec	2025-05-21 11:17:30 +0200	[diff] [blame]	135	}, nil
				136	}
				137
Akron	bb5065f	2025-05-21 12:44:05 +0200	[diff] [blame^]	138	// Parse parses a grammar string into an AST node (for backward compatibility)
Akron	22322ec	2025-05-21 11:17:30 +0200	[diff] [blame]	139	func (p *GrammarParser) Parse(input string) (ast.Node, error) {
				140	// Remove extra spaces around operators to help the parser
				141	input = strings.ReplaceAll(input, " & ", "&")
				142	input = strings.ReplaceAll(input, " \| ", "\|")
				143
				144	// Add spaces around parentheses to help the parser
				145	input = strings.ReplaceAll(input, "(", " ( ")
				146	input = strings.ReplaceAll(input, ")", " ) ")
				147
				148	// Remove any extra spaces
				149	input = strings.TrimSpace(input)
				150
Akron	bb5065f	2025-05-21 12:44:05 +0200	[diff] [blame^]	151	grammar, err := p.tokenParser.ParseString("", input)
Akron	22322ec	2025-05-21 11:17:30 +0200	[diff] [blame]	152	if err != nil {
				153	return nil, fmt.Errorf("failed to parse grammar: %w", err)
				154	}
				155
Akron	bb5065f	2025-05-21 12:44:05 +0200	[diff] [blame^]	156	if grammar.Token == nil {
				157	return nil, fmt.Errorf("expected token expression, got mapping rule")
				158	}
				159
Akron	22322ec	2025-05-21 11:17:30 +0200	[diff] [blame]	160	wrap, err := p.parseExpr(grammar.Token.Expr)
				161	if err != nil {
				162	return nil, err
				163	}
				164	return &ast.Token{Wrap: wrap}, nil
				165	}
				166
Akron	bb5065f	2025-05-21 12:44:05 +0200	[diff] [blame^]	167	// ParseMapping parses a mapping rule string into a MappingResult
				168	func (p GrammarParser) ParseMapping(input string) (MappingResult, error) {
				169	// Remove extra spaces around operators to help the parser
				170	input = strings.ReplaceAll(input, " & ", "&")
				171	input = strings.ReplaceAll(input, " \| ", "\|")
				172	input = strings.ReplaceAll(input, " <> ", "<>")
				173
				174	// Add spaces around parentheses to help the parser
				175	input = strings.ReplaceAll(input, "(", " ( ")
				176	input = strings.ReplaceAll(input, ")", " ) ")
				177
				178	// Remove any extra spaces
				179	input = strings.TrimSpace(input)
				180
				181	grammar, err := p.mappingParser.ParseString("", input)
				182	if err != nil {
				183	return nil, fmt.Errorf("failed to parse grammar: %w", err)
				184	}
				185
				186	if grammar.Mapping == nil {
				187	return nil, fmt.Errorf("expected mapping rule, got token expression")
				188	}
				189
				190	upper, err := p.parseExpr(grammar.Mapping.Upper.Expr)
				191	if err != nil {
				192	return nil, err
				193	}
				194
				195	lower, err := p.parseExpr(grammar.Mapping.Lower.Expr)
				196	if err != nil {
				197	return nil, err
				198	}
				199
				200	return &MappingResult{
				201	Upper: &ast.Token{Wrap: upper},
				202	Lower: &ast.Token{Wrap: lower},
				203	}, nil
				204	}
				205
				206	// MappingResult represents the parsed mapping rule
				207	type MappingResult struct {
				208	Upper *ast.Token
				209	Lower *ast.Token
				210	}
				211
Akron	22322ec	2025-05-21 11:17:30 +0200	[diff] [blame]	212	// parseExpr builds the AST from the parsed Expr
				213	func (p GrammarParser) parseExpr(expr Expr) (ast.Node, error) {
				214	var operands []ast.Node
				215	var operators []string
				216
				217	// Parse the first term
				218	first, err := p.parseTerm(expr.First)
				219	if err != nil {
				220	return nil, err
				221	}
				222	operands = append(operands, first)
				223
				224	// Parse the rest
				225	for _, op := range expr.Rest {
				226	node, err := p.parseTerm(op.Term)
				227	if err != nil {
				228	return nil, err
				229	}
				230	operands = append(operands, node)
				231	operators = append(operators, op.Operator)
				232	}
				233
				234	// If only one operand, return it
				235	if len(operands) == 1 {
				236	return operands[0], nil
				237	}
				238
				239	// Group operands by operator precedence (left-to-right, no precedence between & and \|)
				240	// We'll group by runs of the same operator
				241	var groupOperands []ast.Node
				242	var currentOp string
				243	var currentGroup []ast.Node
				244	for i, op := range operators {
				245	if i == 0 {
				246	currentOp = op
				247	currentGroup = append(currentGroup, operands[i])
				248	}
				249	if op == currentOp {
				250	currentGroup = append(currentGroup, operands[i+1])
				251	} else {
				252	groupOperands = append(groupOperands, &ast.TermGroup{
				253	Operands: append([]ast.Node{}, currentGroup...),
				254	Relation: toRelation(currentOp),
				255	})
				256	currentOp = op
				257	currentGroup = []ast.Node{operands[i+1]}
				258	}
				259	}
				260	if len(currentGroup) > 0 {
				261	groupOperands = append(groupOperands, &ast.TermGroup{
				262	Operands: append([]ast.Node{}, currentGroup...),
				263	Relation: toRelation(currentOp),
				264	})
				265	}
				266	if len(groupOperands) == 1 {
				267	return groupOperands[0], nil
				268	}
				269	// If mixed operators, nest them left-to-right
				270	result := groupOperands[0]
				271	for i := 1; i < len(groupOperands); i++ {
				272	result = &ast.TermGroup{
				273	Operands: []ast.Node{result, groupOperands[i]},
				274	Relation: toRelation(operators[0]),
				275	}
				276	}
				277	return result, nil
				278	}
				279
				280	// parseTerm converts a Term into an AST node
				281	func (p GrammarParser) parseTerm(term Term) (ast.Node, error) {
				282	if term.Simple != nil {
				283	return p.parseSimpleTerm(term.Simple)
				284	}
				285	if term.Paren != nil {
				286	return p.parseExpr(term.Paren.Expr)
				287	}
				288	return nil, fmt.Errorf("invalid term: neither simple nor parenthesized")
				289	}
				290
				291	func toRelation(op string) ast.RelationType {
				292	if op == "\|" {
				293	return ast.OrRelation
				294	}
				295	return ast.AndRelation
				296	}
				297
				298	// parseSimpleTerm converts a SimpleTerm into an AST Term node
				299	func (p GrammarParser) parseSimpleTerm(term SimpleTerm) (ast.Node, error) {
				300	var foundry, layer, key, value string
				301
				302	switch {
				303	case term.WithFoundryLayer != nil:
				304	foundry = term.WithFoundryLayer.Foundry
				305	layer = term.WithFoundryLayer.Layer
				306	key = term.WithFoundryLayer.Key
				307	value = term.WithFoundryLayer.Value
				308	case term.WithFoundryKey != nil:
				309	foundry = term.WithFoundryKey.Foundry
				310	key = term.WithFoundryKey.Key
				311	case term.WithLayer != nil:
				312	layer = term.WithLayer.Layer
				313	key = term.WithLayer.Key
				314	value = term.WithLayer.Value
				315	case term.SimpleKey != nil:
				316	key = term.SimpleKey.Key
				317	value = term.SimpleKey.Value
				318	default:
				319	return nil, fmt.Errorf("invalid term: no valid form found")
				320	}
				321
				322	if foundry == "" {
				323	foundry = p.defaultFoundry
				324	}
				325	if layer == "" {
				326	layer = p.defaultLayer
				327	}
				328
				329	return &ast.Term{
				330	Foundry: foundry,
				331	Key: key,
				332	Layer: layer,
				333	Match: ast.MatchEqual,
				334	Value: value,
				335	}, nil
				336	}