blob: 8ee167f1524e3a4cde3494026a01f254ed92c71a [file] [log] [blame]
Akron22322ec2025-05-21 11:17:30 +02001package parser
2
3import (
4 "fmt"
5 "strings"
6
Akronfa55bb22025-05-26 15:10:42 +02007 "github.com/KorAP/KoralPipe-TermMapper/ast"
Akron22322ec2025-05-21 11:17:30 +02008 "github.com/alecthomas/participle/v2"
9 "github.com/alecthomas/participle/v2/lexer"
10)
11
// GrammarParser parses a simple grammar into AST nodes.
//
// It carries the default foundry and layer that parseSimpleTerm fills in when
// a term does not specify its own, plus one pre-built participle parser for
// each supported top-level grammar (single token expression, mapping rule).
type GrammarParser struct {
	defaultFoundry string                             // substituted when a term has no foundry
	defaultLayer   string                             // substituted when a term has no layer
	tokenParser    *participle.Parser[TokenGrammar]   // used by Parse
	mappingParser  *participle.Parser[MappingGrammar] // used by ParseMapping
}
19
// TokenGrammar represents a single token expression.
// It is the root grammar of the parser used by Parse.
type TokenGrammar struct {
	Token *TokenExpr `parser:"@@"` // the one bracketed token expression
}
24
// MappingGrammar represents a mapping rule.
// It is the root grammar of the parser used by ParseMapping.
type MappingGrammar struct {
	Mapping *MappingRule `parser:"@@"` // the one mapping rule
}
29
// MappingRule represents a mapping between two token expressions,
// written as an upper expression, the literal '<>', and a lower expression.
type MappingRule struct {
	Upper *TokenExpr `parser:"@@"`      // expression left of '<>'
	Lower *TokenExpr `parser:"'<>' @@"` // expression right of '<>'
}
35
// TokenExpr represents a token expression in square brackets: '[' Expr ']'.
type TokenExpr struct {
	Expr *Expr `parser:"'[' @@ ']'"` // the expression between the brackets
}
40
// Expr represents a sequence of terms and operators: one leading term
// followed by zero or more operator/term pairs.
type Expr struct {
	First *Term `parser:"@@"`  // the leading term
	Rest  []*Op `parser:"@@*"` // subsequent operator/term pairs, in source order
}
46
// Op represents one binary operator together with the term that follows it.
type Op struct {
	Operator string `parser:"@('&' | '|')"` // captured operator text: "&" or "|"
	Term     *Term  `parser:"@@"`           // the operand to the right of the operator
}
51
// Term represents either a simple term or a parenthesized expression.
// The two fields are grammar alternatives; parseTerm checks Simple first,
// then Paren.
type Term struct {
	Simple *SimpleTerm `parser:"@@"`   // first alternative: a simple term
	Paren  *ParenExpr  `parser:"| @@"` // second alternative: '(' Expr ')'
}
57
// ParenExpr represents an expression wrapped in parentheses: '(' Expr ')'.
type ParenExpr struct {
	Expr *Expr `parser:"'(' @@ ')'"` // the expression between the parentheses
}
61
// SimpleTerm represents any valid term form.
// The fields are grammar alternatives listed in the order they are declared;
// parseSimpleTerm inspects them in the same order.
type SimpleTerm struct {
	WithFoundryLayer    *FoundryLayerTerm    `parser:"@@"`   // foundry/layer=key[:value]
	WithFoundryWildcard *FoundryWildcardTerm `parser:"| @@"` // foundry/*=key
	WithFoundryKey      *FoundryKeyTerm      `parser:"| @@"` // foundry/key
	WithLayer           *LayerTerm           `parser:"| @@"` // layer=key[:value]
	SimpleKey           *KeyTerm             `parser:"| @@"` // key, key:value or key=value
}
70
// FoundryLayerTerm represents foundry/layer=key:value.
// The ':value' suffix is optional in the grammar.
type FoundryLayerTerm struct {
	Foundry string `parser:"@Ident '/'"`     // identifier before '/'
	Layer   string `parser:"@Ident '='"`     // identifier between '/' and '='
	Key     string `parser:"@Ident"`         // identifier after '='
	Value   string `parser:"(':' @Ident)?"` // optional identifier after ':'
}
78
// FoundryWildcardTerm represents foundry/*=key (wildcard layer).
// No layer is captured; only the foundry and the key are recorded.
type FoundryWildcardTerm struct {
	Foundry string `parser:"@Ident '/' '*' '='"` // identifier before "/*="
	Key     string `parser:"@Ident"`             // identifier after '='
}
84
// FoundryKeyTerm represents foundry/key.
type FoundryKeyTerm struct {
	Foundry string `parser:"@Ident '/'"` // identifier before '/'
	Key     string `parser:"@Ident"`     // identifier after '/'
}
90
// LayerTerm represents layer=key:value (only when no foundry is present).
// The ':value' suffix is optional in the grammar. parseSimpleTerm may
// reinterpret a match as key=value when a default layer is configured and the
// captured layer differs from it.
type LayerTerm struct {
	Layer string `parser:"@Ident '='"`     // identifier before '='
	Key   string `parser:"@Ident"`         // identifier after '='
	Value string `parser:"(':' @Ident)?"` // optional identifier after ':'
}
97
// KeyTerm represents key:value or key=value.
// The separator and value are optional in the grammar, so a bare key also
// matches.
type KeyTerm struct {
	Key   string `parser:"@Ident"`                   // the key identifier
	Value string `parser:"((':' | '=') @Ident)?"` // optional identifier after ':' or '='
}
103
// EscapedPunct represents an escaped punctuation character like \(.
// NOTE(review): no other grammar struct visible in this file references this
// type — confirm whether it is still used elsewhere in the package.
type EscapedPunct struct {
	Prefix string `parser:"@Ident"` // an Ident token preceding the punctuation
	Punct  string `parser:"@Punct"` // the Punct token itself
}
109
110// NewGrammarParser creates a new grammar parser with optional default foundry and layer
111func NewGrammarParser(defaultFoundry, defaultLayer string) (*GrammarParser, error) {
112 lex := lexer.MustSimple([]lexer.SimpleRule{
Akroncc25e932025-06-02 19:39:43 +0200113 {Name: "Ident", Pattern: `(?:[a-zA-Z$,.]|\\.)(?:[a-zA-Z0-9_$,.]|\\.)*`},
114 {Name: "Punct", Pattern: `[\[\]()&\|=:/\*]|<>`},
Akron22322ec2025-05-21 11:17:30 +0200115 {Name: "Whitespace", Pattern: `\s+`},
116 })
117
Akronbb5065f2025-05-21 12:44:05 +0200118 tokenParser, err := participle.Build[TokenGrammar](
Akron22322ec2025-05-21 11:17:30 +0200119 participle.Lexer(lex),
120 participle.UseLookahead(2),
121 participle.Elide("Whitespace"),
122 )
123 if err != nil {
Akronbb5065f2025-05-21 12:44:05 +0200124 return nil, fmt.Errorf("failed to build token parser: %w", err)
125 }
126
127 mappingParser, err := participle.Build[MappingGrammar](
128 participle.Lexer(lex),
129 participle.UseLookahead(2),
130 participle.Elide("Whitespace"),
131 )
132 if err != nil {
133 return nil, fmt.Errorf("failed to build mapping parser: %w", err)
Akron22322ec2025-05-21 11:17:30 +0200134 }
135
136 return &GrammarParser{
137 defaultFoundry: defaultFoundry,
138 defaultLayer: defaultLayer,
Akronbb5065f2025-05-21 12:44:05 +0200139 tokenParser: tokenParser,
140 mappingParser: mappingParser,
Akron22322ec2025-05-21 11:17:30 +0200141 }, nil
142}
143
Akronbb5065f2025-05-21 12:44:05 +0200144// Parse parses a grammar string into an AST node (for backward compatibility)
Akron22322ec2025-05-21 11:17:30 +0200145func (p *GrammarParser) Parse(input string) (ast.Node, error) {
146 // Remove extra spaces around operators to help the parser
147 input = strings.ReplaceAll(input, " & ", "&")
148 input = strings.ReplaceAll(input, " | ", "|")
149
Akron76b87972025-06-02 16:59:59 +0200150 // Add spaces around parentheses that are not escaped
151 // We need to be careful not to break escape sequences like \(
152 result := make([]rune, 0, len(input)*2)
153 runes := []rune(input)
154 for i, r := range runes {
155 if (r == '(' || r == ')') && (i == 0 || runes[i-1] != '\\') {
Akroncc25e932025-06-02 19:39:43 +0200156 // Only add spaces if the parenthesis is not escaped and not part of an identifier
157 // Check if this parenthesis is inside brackets (part of an identifier)
158 insideBrackets := false
159 bracketDepth := 0
160 for j := 0; j < i; j++ {
161 if runes[j] == '[' {
162 bracketDepth++
163 } else if runes[j] == ']' {
164 bracketDepth--
165 }
166 }
167 insideBrackets = bracketDepth > 0
168
169 if !insideBrackets {
170 result = append(result, ' ', r, ' ')
171 } else {
172 result = append(result, r)
173 }
Akron76b87972025-06-02 16:59:59 +0200174 } else {
175 result = append(result, r)
176 }
177 }
178 input = string(result)
Akron22322ec2025-05-21 11:17:30 +0200179
180 // Remove any extra spaces
181 input = strings.TrimSpace(input)
182
Akronbb5065f2025-05-21 12:44:05 +0200183 grammar, err := p.tokenParser.ParseString("", input)
Akron22322ec2025-05-21 11:17:30 +0200184 if err != nil {
185 return nil, fmt.Errorf("failed to parse grammar: %w", err)
186 }
187
Akronbb5065f2025-05-21 12:44:05 +0200188 if grammar.Token == nil {
189 return nil, fmt.Errorf("expected token expression, got mapping rule")
190 }
191
Akron22322ec2025-05-21 11:17:30 +0200192 wrap, err := p.parseExpr(grammar.Token.Expr)
193 if err != nil {
194 return nil, err
195 }
196 return &ast.Token{Wrap: wrap}, nil
197}
198
Akronbb5065f2025-05-21 12:44:05 +0200199// ParseMapping parses a mapping rule string into a MappingResult
200func (p *GrammarParser) ParseMapping(input string) (*MappingResult, error) {
201 // Remove extra spaces around operators to help the parser
202 input = strings.ReplaceAll(input, " & ", "&")
203 input = strings.ReplaceAll(input, " | ", "|")
204 input = strings.ReplaceAll(input, " <> ", "<>")
205
Akron76b87972025-06-02 16:59:59 +0200206 // Add spaces around parentheses that are not escaped
207 // We need to be careful not to break escape sequences like \(
208 result := make([]rune, 0, len(input)*2)
209 runes := []rune(input)
210 for i, r := range runes {
211 if (r == '(' || r == ')') && (i == 0 || runes[i-1] != '\\') {
Akroncc25e932025-06-02 19:39:43 +0200212 // Only add spaces if the parenthesis is not escaped and not part of an identifier
213 // Check if this parenthesis is inside brackets (part of an identifier)
214 insideBrackets := false
215 bracketDepth := 0
216 for j := 0; j < i; j++ {
217 if runes[j] == '[' {
218 bracketDepth++
219 } else if runes[j] == ']' {
220 bracketDepth--
221 }
222 }
223 insideBrackets = bracketDepth > 0
224
225 if !insideBrackets {
226 result = append(result, ' ', r, ' ')
227 } else {
228 result = append(result, r)
229 }
Akron76b87972025-06-02 16:59:59 +0200230 } else {
231 result = append(result, r)
232 }
233 }
234 input = string(result)
Akronbb5065f2025-05-21 12:44:05 +0200235
236 // Remove any extra spaces
237 input = strings.TrimSpace(input)
238
239 grammar, err := p.mappingParser.ParseString("", input)
240 if err != nil {
241 return nil, fmt.Errorf("failed to parse grammar: %w", err)
242 }
243
244 if grammar.Mapping == nil {
245 return nil, fmt.Errorf("expected mapping rule, got token expression")
246 }
247
248 upper, err := p.parseExpr(grammar.Mapping.Upper.Expr)
249 if err != nil {
250 return nil, err
251 }
252
253 lower, err := p.parseExpr(grammar.Mapping.Lower.Expr)
254 if err != nil {
255 return nil, err
256 }
257
258 return &MappingResult{
259 Upper: &ast.Token{Wrap: upper},
260 Lower: &ast.Token{Wrap: lower},
261 }, nil
262}
263
// MappingResult represents the parsed mapping rule.
// ParseMapping fills Upper from the expression left of '<>' and Lower from
// the expression right of it.
type MappingResult struct {
	Upper *ast.Token
	Lower *ast.Token
}
269
Akron22322ec2025-05-21 11:17:30 +0200270// parseExpr builds the AST from the parsed Expr
271func (p *GrammarParser) parseExpr(expr *Expr) (ast.Node, error) {
272 var operands []ast.Node
273 var operators []string
274
275 // Parse the first term
276 first, err := p.parseTerm(expr.First)
277 if err != nil {
278 return nil, err
279 }
280 operands = append(operands, first)
281
282 // Parse the rest
283 for _, op := range expr.Rest {
284 node, err := p.parseTerm(op.Term)
285 if err != nil {
286 return nil, err
287 }
288 operands = append(operands, node)
289 operators = append(operators, op.Operator)
290 }
291
292 // If only one operand, return it
293 if len(operands) == 1 {
294 return operands[0], nil
295 }
296
297 // Group operands by operator precedence (left-to-right, no precedence between & and |)
298 // We'll group by runs of the same operator
299 var groupOperands []ast.Node
300 var currentOp string
301 var currentGroup []ast.Node
302 for i, op := range operators {
303 if i == 0 {
304 currentOp = op
305 currentGroup = append(currentGroup, operands[i])
306 }
307 if op == currentOp {
308 currentGroup = append(currentGroup, operands[i+1])
309 } else {
310 groupOperands = append(groupOperands, &ast.TermGroup{
311 Operands: append([]ast.Node{}, currentGroup...),
312 Relation: toRelation(currentOp),
313 })
314 currentOp = op
315 currentGroup = []ast.Node{operands[i+1]}
316 }
317 }
318 if len(currentGroup) > 0 {
319 groupOperands = append(groupOperands, &ast.TermGroup{
320 Operands: append([]ast.Node{}, currentGroup...),
321 Relation: toRelation(currentOp),
322 })
323 }
324 if len(groupOperands) == 1 {
325 return groupOperands[0], nil
326 }
327 // If mixed operators, nest them left-to-right
328 result := groupOperands[0]
329 for i := 1; i < len(groupOperands); i++ {
330 result = &ast.TermGroup{
331 Operands: []ast.Node{result, groupOperands[i]},
332 Relation: toRelation(operators[0]),
333 }
334 }
335 return result, nil
336}
337
338// parseTerm converts a Term into an AST node
339func (p *GrammarParser) parseTerm(term *Term) (ast.Node, error) {
340 if term.Simple != nil {
341 return p.parseSimpleTerm(term.Simple)
342 }
343 if term.Paren != nil {
344 return p.parseExpr(term.Paren.Expr)
345 }
346 return nil, fmt.Errorf("invalid term: neither simple nor parenthesized")
347}
348
349func toRelation(op string) ast.RelationType {
350 if op == "|" {
351 return ast.OrRelation
352 }
353 return ast.AndRelation
354}
355
// unescapeString handles unescaping of backslash-escaped characters.
// Each backslash that is followed by another byte is dropped and the
// following byte is kept verbatim; a backslash at the very end of the string
// has nothing to escape and is kept as is.
func unescapeString(s string) string {
	n := len(s)
	if n == 0 {
		return s
	}

	out := make([]byte, 0, n)
	for pos := 0; pos < n; pos++ {
		ch := s[pos]
		if ch == '\\' && pos+1 < n {
			// Skip the backslash and emit the byte it escapes.
			pos++
			ch = s[pos]
		}
		out = append(out, ch)
	}
	return string(out)
}
377
Akron22322ec2025-05-21 11:17:30 +0200378// parseSimpleTerm converts a SimpleTerm into an AST Term node
379func (p *GrammarParser) parseSimpleTerm(term *SimpleTerm) (ast.Node, error) {
380 var foundry, layer, key, value string
381
382 switch {
383 case term.WithFoundryLayer != nil:
Akron121c66e2025-06-02 16:34:05 +0200384 foundry = unescapeString(term.WithFoundryLayer.Foundry)
385 layer = unescapeString(term.WithFoundryLayer.Layer)
386 key = unescapeString(term.WithFoundryLayer.Key)
387 value = unescapeString(term.WithFoundryLayer.Value)
Akroncc25e932025-06-02 19:39:43 +0200388 case term.WithFoundryWildcard != nil:
389 foundry = unescapeString(term.WithFoundryWildcard.Foundry)
390 key = unescapeString(term.WithFoundryWildcard.Key)
Akron22322ec2025-05-21 11:17:30 +0200391 case term.WithFoundryKey != nil:
Akron121c66e2025-06-02 16:34:05 +0200392 foundry = unescapeString(term.WithFoundryKey.Foundry)
393 key = unescapeString(term.WithFoundryKey.Key)
Akron22322ec2025-05-21 11:17:30 +0200394 case term.WithLayer != nil:
Akroncc25e932025-06-02 19:39:43 +0200395 // Special case: if LayerTerm was parsed but the layer doesn't match the default layer,
396 // treat it as a key=value pattern instead
397 parsedLayer := unescapeString(term.WithLayer.Layer)
398 parsedKey := unescapeString(term.WithLayer.Key)
399 parsedValue := unescapeString(term.WithLayer.Value)
400
401 if p.defaultLayer != "" && parsedLayer == p.defaultLayer {
402 // This is a genuine layer=key pattern when the layer matches the default
403 layer = parsedLayer
404 key = parsedKey
405 value = parsedValue
406 } else if p.defaultLayer != "" && parsedLayer != p.defaultLayer {
407 // This should be treated as key=value pattern when there's a default layer but it doesn't match
408 key = parsedLayer
409 value = parsedKey
410 } else {
411 // No default layer context, treat as genuine layer=key pattern
412 layer = parsedLayer
413 key = parsedKey
414 value = parsedValue
415 }
Akron22322ec2025-05-21 11:17:30 +0200416 case term.SimpleKey != nil:
Akron121c66e2025-06-02 16:34:05 +0200417 key = unescapeString(term.SimpleKey.Key)
418 value = unescapeString(term.SimpleKey.Value)
Akron22322ec2025-05-21 11:17:30 +0200419 default:
420 return nil, fmt.Errorf("invalid term: no valid form found")
421 }
422
423 if foundry == "" {
424 foundry = p.defaultFoundry
425 }
426 if layer == "" {
427 layer = p.defaultLayer
428 }
429
430 return &ast.Term{
431 Foundry: foundry,
432 Key: key,
433 Layer: layer,
434 Match: ast.MatchEqual,
435 Value: value,
436 }, nil
437}