blob: 81ebe62fed85a56967afc3360e22e5f8763c0aba [file] [log] [blame]
Akron22322ec2025-05-21 11:17:30 +02001package parser
2
3import (
4 "fmt"
5 "strings"
6
Akronfa55bb22025-05-26 15:10:42 +02007 "github.com/KorAP/KoralPipe-TermMapper/ast"
Akron22322ec2025-05-21 11:17:30 +02008 "github.com/alecthomas/participle/v2"
9 "github.com/alecthomas/participle/v2/lexer"
10)
11
// GrammarParser parses a simple grammar into AST nodes.
//
// defaultFoundry and defaultLayer are the values parseSimpleTerm fills in when
// a term leaves its foundry or layer empty. tokenParser backs Parse (a single
// token expression) and mappingParser backs ParseMapping (a mapping rule).
type GrammarParser struct {
	defaultFoundry string
	defaultLayer   string
	tokenParser    *participle.Parser[TokenGrammar]
	mappingParser  *participle.Parser[MappingGrammar]
}
19
// TokenGrammar represents a single token expression.
// It is the root grammar type of the parser used by GrammarParser.Parse.
type TokenGrammar struct {
	Token *TokenExpr `parser:"@@"`
}
24
// MappingGrammar represents a mapping rule.
// It is the root grammar type of the parser used by GrammarParser.ParseMapping.
type MappingGrammar struct {
	Mapping *MappingRule `parser:"@@"`
}
29
// MappingRule represents a mapping between two token expressions.
// The textual form is an upper token expression, the literal "<>", and a
// lower token expression, e.g. "[a] <> [b]".
type MappingRule struct {
	Upper *TokenExpr `parser:"@@"`
	Lower *TokenExpr `parser:"'<>' @@"`
}
35
// TokenExpr represents a token expression in square brackets,
// i.e. "[" followed by an expression followed by "]".
type TokenExpr struct {
	Expr *Expr `parser:"'[' @@ ']'"`
}
40
// Expr represents a sequence of terms and operators.
// First is the leading term; Rest holds zero or more operator/term pairs that
// follow it in source order.
type Expr struct {
	First *Term `parser:"@@"`
	Rest  []*Op `parser:"@@*"`
}
46
// Op represents one operator ("&" or "|") together with the term that
// follows it on the right-hand side.
type Op struct {
	Operator string `parser:"@('&' | '|')"`
	Term     *Term  `parser:"@@"`
}
51
// Term represents either a simple term or a parenthesized expression.
// The two fields are grammar alternatives: parseTerm checks Simple first and
// Paren second.
type Term struct {
	Simple *SimpleTerm `parser:"@@"`
	Paren  *ParenExpr  `parser:"| @@"`
}
57
// ParenExpr represents an expression enclosed in parentheses,
// i.e. "(" followed by an expression followed by ")".
type ParenExpr struct {
	Expr *Expr `parser:"'(' @@ ')'"`
}
61
// SimpleTerm represents any valid term form.
// The fields are grammar alternatives listed from the most specific form
// (foundry/layer=key:value) down to the bare key; the field order is part of
// the grammar and must not be changed.
type SimpleTerm struct {
	WithFoundryLayer *FoundryLayerTerm `parser:"@@"`
	WithFoundryKey   *FoundryKeyTerm   `parser:"| @@"`
	WithLayer        *LayerTerm        `parser:"| @@"`
	SimpleKey        *KeyTerm          `parser:"| @@"`
}
69
// FoundryLayerTerm represents foundry/layer=key:value.
// The ":value" suffix is optional; Value stays empty when it is absent.
type FoundryLayerTerm struct {
	Foundry string `parser:"@Ident '/'"`
	Layer   string `parser:"@Ident '='"`
	Key     string `parser:"@Ident"`
	Value   string `parser:"(':' @Ident)?"`
}
77
// FoundryKeyTerm represents foundry/key.
// Unlike the other term forms, this form has no layer and no value part.
type FoundryKeyTerm struct {
	Foundry string `parser:"@Ident '/'"`
	Key     string `parser:"@Ident"`
}
83
// LayerTerm represents layer=key:value.
// The ":value" suffix is optional; Value stays empty when it is absent.
type LayerTerm struct {
	Layer string `parser:"@Ident '='"`
	Key   string `parser:"@Ident"`
	Value string `parser:"(':' @Ident)?"`
}
90
// KeyTerm represents key:value.
// The ":value" suffix is optional; Value stays empty when it is absent.
type KeyTerm struct {
	Key   string `parser:"@Ident"`
	Value string `parser:"(':' @Ident)?"`
}
96
97// NewGrammarParser creates a new grammar parser with optional default foundry and layer
98func NewGrammarParser(defaultFoundry, defaultLayer string) (*GrammarParser, error) {
99 lex := lexer.MustSimple([]lexer.SimpleRule{
Akron121c66e2025-06-02 16:34:05 +0200100 {Name: "Ident", Pattern: `(?:[a-zA-Z$]|\\.)(?:[a-zA-Z0-9_$]|\\.)*`},
Akronbb5065f2025-05-21 12:44:05 +0200101 {Name: "Punct", Pattern: `[\[\]()&\|=:/]|<>`},
Akron22322ec2025-05-21 11:17:30 +0200102 {Name: "Whitespace", Pattern: `\s+`},
103 })
104
Akronbb5065f2025-05-21 12:44:05 +0200105 tokenParser, err := participle.Build[TokenGrammar](
Akron22322ec2025-05-21 11:17:30 +0200106 participle.Lexer(lex),
107 participle.UseLookahead(2),
108 participle.Elide("Whitespace"),
109 )
110 if err != nil {
Akronbb5065f2025-05-21 12:44:05 +0200111 return nil, fmt.Errorf("failed to build token parser: %w", err)
112 }
113
114 mappingParser, err := participle.Build[MappingGrammar](
115 participle.Lexer(lex),
116 participle.UseLookahead(2),
117 participle.Elide("Whitespace"),
118 )
119 if err != nil {
120 return nil, fmt.Errorf("failed to build mapping parser: %w", err)
Akron22322ec2025-05-21 11:17:30 +0200121 }
122
123 return &GrammarParser{
124 defaultFoundry: defaultFoundry,
125 defaultLayer: defaultLayer,
Akronbb5065f2025-05-21 12:44:05 +0200126 tokenParser: tokenParser,
127 mappingParser: mappingParser,
Akron22322ec2025-05-21 11:17:30 +0200128 }, nil
129}
130
Akronbb5065f2025-05-21 12:44:05 +0200131// Parse parses a grammar string into an AST node (for backward compatibility)
Akron22322ec2025-05-21 11:17:30 +0200132func (p *GrammarParser) Parse(input string) (ast.Node, error) {
133 // Remove extra spaces around operators to help the parser
134 input = strings.ReplaceAll(input, " & ", "&")
135 input = strings.ReplaceAll(input, " | ", "|")
136
Akron76b87972025-06-02 16:59:59 +0200137 // Add spaces around parentheses that are not escaped
138 // We need to be careful not to break escape sequences like \(
139 result := make([]rune, 0, len(input)*2)
140 runes := []rune(input)
141 for i, r := range runes {
142 if (r == '(' || r == ')') && (i == 0 || runes[i-1] != '\\') {
143 // Only add spaces if the parenthesis is not escaped
144 result = append(result, ' ', r, ' ')
145 } else {
146 result = append(result, r)
147 }
148 }
149 input = string(result)
Akron22322ec2025-05-21 11:17:30 +0200150
151 // Remove any extra spaces
152 input = strings.TrimSpace(input)
153
Akronbb5065f2025-05-21 12:44:05 +0200154 grammar, err := p.tokenParser.ParseString("", input)
Akron22322ec2025-05-21 11:17:30 +0200155 if err != nil {
156 return nil, fmt.Errorf("failed to parse grammar: %w", err)
157 }
158
Akronbb5065f2025-05-21 12:44:05 +0200159 if grammar.Token == nil {
160 return nil, fmt.Errorf("expected token expression, got mapping rule")
161 }
162
Akron22322ec2025-05-21 11:17:30 +0200163 wrap, err := p.parseExpr(grammar.Token.Expr)
164 if err != nil {
165 return nil, err
166 }
167 return &ast.Token{Wrap: wrap}, nil
168}
169
Akronbb5065f2025-05-21 12:44:05 +0200170// ParseMapping parses a mapping rule string into a MappingResult
171func (p *GrammarParser) ParseMapping(input string) (*MappingResult, error) {
172 // Remove extra spaces around operators to help the parser
173 input = strings.ReplaceAll(input, " & ", "&")
174 input = strings.ReplaceAll(input, " | ", "|")
175 input = strings.ReplaceAll(input, " <> ", "<>")
176
Akron76b87972025-06-02 16:59:59 +0200177 // Add spaces around parentheses that are not escaped
178 // We need to be careful not to break escape sequences like \(
179 result := make([]rune, 0, len(input)*2)
180 runes := []rune(input)
181 for i, r := range runes {
182 if (r == '(' || r == ')') && (i == 0 || runes[i-1] != '\\') {
183 // Only add spaces if the parenthesis is not escaped
184 result = append(result, ' ', r, ' ')
185 } else {
186 result = append(result, r)
187 }
188 }
189 input = string(result)
Akronbb5065f2025-05-21 12:44:05 +0200190
191 // Remove any extra spaces
192 input = strings.TrimSpace(input)
193
194 grammar, err := p.mappingParser.ParseString("", input)
195 if err != nil {
196 return nil, fmt.Errorf("failed to parse grammar: %w", err)
197 }
198
199 if grammar.Mapping == nil {
200 return nil, fmt.Errorf("expected mapping rule, got token expression")
201 }
202
203 upper, err := p.parseExpr(grammar.Mapping.Upper.Expr)
204 if err != nil {
205 return nil, err
206 }
207
208 lower, err := p.parseExpr(grammar.Mapping.Lower.Expr)
209 if err != nil {
210 return nil, err
211 }
212
213 return &MappingResult{
214 Upper: &ast.Token{Wrap: upper},
215 Lower: &ast.Token{Wrap: lower},
216 }, nil
217}
218
// MappingResult represents the parsed mapping rule.
// Upper holds the token built from the expression left of "<>" and Lower the
// token built from the expression right of it.
type MappingResult struct {
	Upper *ast.Token
	Lower *ast.Token
}
224
Akron22322ec2025-05-21 11:17:30 +0200225// parseExpr builds the AST from the parsed Expr
226func (p *GrammarParser) parseExpr(expr *Expr) (ast.Node, error) {
227 var operands []ast.Node
228 var operators []string
229
230 // Parse the first term
231 first, err := p.parseTerm(expr.First)
232 if err != nil {
233 return nil, err
234 }
235 operands = append(operands, first)
236
237 // Parse the rest
238 for _, op := range expr.Rest {
239 node, err := p.parseTerm(op.Term)
240 if err != nil {
241 return nil, err
242 }
243 operands = append(operands, node)
244 operators = append(operators, op.Operator)
245 }
246
247 // If only one operand, return it
248 if len(operands) == 1 {
249 return operands[0], nil
250 }
251
252 // Group operands by operator precedence (left-to-right, no precedence between & and |)
253 // We'll group by runs of the same operator
254 var groupOperands []ast.Node
255 var currentOp string
256 var currentGroup []ast.Node
257 for i, op := range operators {
258 if i == 0 {
259 currentOp = op
260 currentGroup = append(currentGroup, operands[i])
261 }
262 if op == currentOp {
263 currentGroup = append(currentGroup, operands[i+1])
264 } else {
265 groupOperands = append(groupOperands, &ast.TermGroup{
266 Operands: append([]ast.Node{}, currentGroup...),
267 Relation: toRelation(currentOp),
268 })
269 currentOp = op
270 currentGroup = []ast.Node{operands[i+1]}
271 }
272 }
273 if len(currentGroup) > 0 {
274 groupOperands = append(groupOperands, &ast.TermGroup{
275 Operands: append([]ast.Node{}, currentGroup...),
276 Relation: toRelation(currentOp),
277 })
278 }
279 if len(groupOperands) == 1 {
280 return groupOperands[0], nil
281 }
282 // If mixed operators, nest them left-to-right
283 result := groupOperands[0]
284 for i := 1; i < len(groupOperands); i++ {
285 result = &ast.TermGroup{
286 Operands: []ast.Node{result, groupOperands[i]},
287 Relation: toRelation(operators[0]),
288 }
289 }
290 return result, nil
291}
292
293// parseTerm converts a Term into an AST node
294func (p *GrammarParser) parseTerm(term *Term) (ast.Node, error) {
295 if term.Simple != nil {
296 return p.parseSimpleTerm(term.Simple)
297 }
298 if term.Paren != nil {
299 return p.parseExpr(term.Paren.Expr)
300 }
301 return nil, fmt.Errorf("invalid term: neither simple nor parenthesized")
302}
303
304func toRelation(op string) ast.RelationType {
305 if op == "|" {
306 return ast.OrRelation
307 }
308 return ast.AndRelation
309}
310
// unescapeString handles unescaping of backslash-escaped characters.
// Each backslash that is followed by another byte is dropped and the following
// byte is copied verbatim (so `\\` becomes a single backslash). A backslash at
// the very end of the string has nothing to escape and is kept as is.
func unescapeString(s string) string {
	out := make([]byte, 0, len(s))
	for pos := 0; pos < len(s); pos++ {
		c := s[pos]
		if c == '\\' && pos+1 < len(s) {
			// Skip the backslash and take the escaped byte instead.
			pos++
			c = s[pos]
		}
		out = append(out, c)
	}
	return string(out)
}
332
Akron22322ec2025-05-21 11:17:30 +0200333// parseSimpleTerm converts a SimpleTerm into an AST Term node
334func (p *GrammarParser) parseSimpleTerm(term *SimpleTerm) (ast.Node, error) {
335 var foundry, layer, key, value string
336
337 switch {
338 case term.WithFoundryLayer != nil:
Akron121c66e2025-06-02 16:34:05 +0200339 foundry = unescapeString(term.WithFoundryLayer.Foundry)
340 layer = unescapeString(term.WithFoundryLayer.Layer)
341 key = unescapeString(term.WithFoundryLayer.Key)
342 value = unescapeString(term.WithFoundryLayer.Value)
Akron22322ec2025-05-21 11:17:30 +0200343 case term.WithFoundryKey != nil:
Akron121c66e2025-06-02 16:34:05 +0200344 foundry = unescapeString(term.WithFoundryKey.Foundry)
345 key = unescapeString(term.WithFoundryKey.Key)
Akron22322ec2025-05-21 11:17:30 +0200346 case term.WithLayer != nil:
Akron121c66e2025-06-02 16:34:05 +0200347 layer = unescapeString(term.WithLayer.Layer)
348 key = unescapeString(term.WithLayer.Key)
349 value = unescapeString(term.WithLayer.Value)
Akron22322ec2025-05-21 11:17:30 +0200350 case term.SimpleKey != nil:
Akron121c66e2025-06-02 16:34:05 +0200351 key = unescapeString(term.SimpleKey.Key)
352 value = unescapeString(term.SimpleKey.Value)
Akron22322ec2025-05-21 11:17:30 +0200353 default:
354 return nil, fmt.Errorf("invalid term: no valid form found")
355 }
356
357 if foundry == "" {
358 foundry = p.defaultFoundry
359 }
360 if layer == "" {
361 layer = p.defaultLayer
362 }
363
364 return &ast.Term{
365 Foundry: foundry,
366 Key: key,
367 Layer: layer,
368 Match: ast.MatchEqual,
369 Value: value,
370 }, nil
371}