blob: 4862ceaf6dac1cdd9b2631d6af9ab5017345c9ae [file] [log] [blame]
Akron22322ec2025-05-21 11:17:30 +02001package parser
2
3import (
4 "fmt"
5 "strings"
6
Akronfa55bb22025-05-26 15:10:42 +02007 "github.com/KorAP/KoralPipe-TermMapper/ast"
Akron22322ec2025-05-21 11:17:30 +02008 "github.com/alecthomas/participle/v2"
9 "github.com/alecthomas/participle/v2/lexer"
10)
11
12// GrammarParser parses a simple grammar into AST nodes
13type GrammarParser struct {
14 defaultFoundry string
15 defaultLayer string
Akronbb5065f2025-05-21 12:44:05 +020016 tokenParser *participle.Parser[TokenGrammar]
17 mappingParser *participle.Parser[MappingGrammar]
Akron22322ec2025-05-21 11:17:30 +020018}
19
Akronbb5065f2025-05-21 12:44:05 +020020// TokenGrammar represents a single token expression
21type TokenGrammar struct {
22 Token *TokenExpr `parser:"@@"`
23}
24
25// MappingGrammar represents a mapping rule
26type MappingGrammar struct {
27 Mapping *MappingRule `parser:"@@"`
28}
29
Akronbb5065f2025-05-21 12:44:05 +020030// MappingRule represents a mapping between two token expressions
31type MappingRule struct {
32 Upper *TokenExpr `parser:"@@"`
33 Lower *TokenExpr `parser:"'<>' @@"`
Akron22322ec2025-05-21 11:17:30 +020034}
35
36// TokenExpr represents a token expression in square brackets
37type TokenExpr struct {
Akronb40f5ac2025-05-21 11:22:33 +020038 Expr *Expr `parser:"'[' @@ ']'"`
Akron22322ec2025-05-21 11:17:30 +020039}
40
41// Expr represents a sequence of terms and operators
42type Expr struct {
Akronb40f5ac2025-05-21 11:22:33 +020043 First *Term `parser:"@@"`
44 Rest []*Op `parser:"@@*"`
Akron22322ec2025-05-21 11:17:30 +020045}
46
47type Op struct {
Akronb40f5ac2025-05-21 11:22:33 +020048 Operator string `parser:"@('&' | '|')"`
49 Term *Term `parser:"@@"`
Akron22322ec2025-05-21 11:17:30 +020050}
51
52// Term represents either a simple term or a parenthesized expression
53type Term struct {
Akronb40f5ac2025-05-21 11:22:33 +020054 Simple *SimpleTerm `parser:"@@"`
55 Paren *ParenExpr `parser:"| @@"`
Akron22322ec2025-05-21 11:17:30 +020056}
57
58type ParenExpr struct {
Akronb40f5ac2025-05-21 11:22:33 +020059 Expr *Expr `parser:"'(' @@ ')'"`
Akron22322ec2025-05-21 11:17:30 +020060}
61
62// SimpleTerm represents any valid term form
63type SimpleTerm struct {
Akroncc25e932025-06-02 19:39:43 +020064 WithFoundryLayer *FoundryLayerTerm `parser:"@@"`
65 WithFoundryWildcard *FoundryWildcardTerm `parser:"| @@"`
66 WithFoundryKey *FoundryKeyTerm `parser:"| @@"`
67 WithLayer *LayerTerm `parser:"| @@"`
68 SimpleKey *KeyTerm `parser:"| @@"`
Akron22322ec2025-05-21 11:17:30 +020069}
70
// FoundryLayerTerm is the fully qualified form foundry/layer=key with an
// optional ":value" suffix.
type FoundryLayerTerm struct {
	// Foundry is the identifier before '/'.
	Foundry string `parser:"@Ident '/'"`
	// Layer is the identifier between '/' and '='.
	Layer string `parser:"@Ident '='"`
	// Key is the identifier after '='.
	Key string `parser:"@Ident"`
	// Value is the identifier after ':'; empty when the suffix is absent.
	Value string `parser:"(':' @Ident)?"`
}
78
// FoundryWildcardTerm is the form foundry/*=key, where the layer position
// is filled by a literal '*'.
type FoundryWildcardTerm struct {
	// Foundry is the identifier before "/*=".
	Foundry string `parser:"@Ident '/' '*' '='"`
	// Key is the identifier after '='.
	Key string `parser:"@Ident"`
}
84
// FoundryKeyTerm is the form foundry/key, with no layer part.
type FoundryKeyTerm struct {
	// Foundry is the identifier before '/'.
	Foundry string `parser:"@Ident '/'"`
	// Key is the identifier after '/'.
	Key string `parser:"@Ident"`
}
90
// LayerTerm is the form layer=key with an optional ":value" suffix; it
// carries no foundry part.
type LayerTerm struct {
	// Layer is the identifier before '='.
	Layer string `parser:"@Ident '='"`
	// Key is the identifier after '='.
	Key string `parser:"@Ident"`
	// Value is the identifier after ':'; empty when the suffix is absent.
	Value string `parser:"(':' @Ident)?"`
}
97
// KeyTerm is a bare key, optionally followed by a value introduced with
// either ':' or '=' (key, key:value, key=value).
type KeyTerm struct {
	// Key is the leading identifier.
	Key string `parser:"@Ident"`
	// Value is the identifier after ':' or '='; empty when absent.
	Value string `parser:"((':' | '=') @Ident)?"`
}
103
// EscapedPunct models an escaped punctuation character such as \( as an
// identifier token followed by a punctuation token.
type EscapedPunct struct {
	// Prefix is the identifier token preceding the punctuation.
	Prefix string `parser:"@Ident"`
	// Punct is the punctuation token itself.
	Punct string `parser:"@Punct"`
}
109
110// NewGrammarParser creates a new grammar parser with optional default foundry and layer
111func NewGrammarParser(defaultFoundry, defaultLayer string) (*GrammarParser, error) {
112 lex := lexer.MustSimple([]lexer.SimpleRule{
Akroncc25e932025-06-02 19:39:43 +0200113 {Name: "Ident", Pattern: `(?:[a-zA-Z$,.]|\\.)(?:[a-zA-Z0-9_$,.]|\\.)*`},
114 {Name: "Punct", Pattern: `[\[\]()&\|=:/\*]|<>`},
Akron22322ec2025-05-21 11:17:30 +0200115 {Name: "Whitespace", Pattern: `\s+`},
116 })
117
Akronbb5065f2025-05-21 12:44:05 +0200118 tokenParser, err := participle.Build[TokenGrammar](
Akron22322ec2025-05-21 11:17:30 +0200119 participle.Lexer(lex),
120 participle.UseLookahead(2),
121 participle.Elide("Whitespace"),
122 )
123 if err != nil {
Akronbb5065f2025-05-21 12:44:05 +0200124 return nil, fmt.Errorf("failed to build token parser: %w", err)
125 }
126
127 mappingParser, err := participle.Build[MappingGrammar](
128 participle.Lexer(lex),
129 participle.UseLookahead(2),
130 participle.Elide("Whitespace"),
131 )
132 if err != nil {
133 return nil, fmt.Errorf("failed to build mapping parser: %w", err)
Akron22322ec2025-05-21 11:17:30 +0200134 }
135
136 return &GrammarParser{
137 defaultFoundry: defaultFoundry,
138 defaultLayer: defaultLayer,
Akronbb5065f2025-05-21 12:44:05 +0200139 tokenParser: tokenParser,
140 mappingParser: mappingParser,
Akron22322ec2025-05-21 11:17:30 +0200141 }, nil
142}
143
Akron6b4c9eb2025-07-03 14:31:58 +0200144// preprocessInput normalizes the input string by handling operators and parentheses
145func (p *GrammarParser) preprocessInput(input string) string {
Akronbb5065f2025-05-21 12:44:05 +0200146 // Remove extra spaces around operators to help the parser
147 input = strings.ReplaceAll(input, " & ", "&")
148 input = strings.ReplaceAll(input, " | ", "|")
149 input = strings.ReplaceAll(input, " <> ", "<>")
150
Akron76b87972025-06-02 16:59:59 +0200151 // Add spaces around parentheses that are not escaped
Akron76b87972025-06-02 16:59:59 +0200152 result := make([]rune, 0, len(input)*2)
153 runes := []rune(input)
154 for i, r := range runes {
155 if (r == '(' || r == ')') && (i == 0 || runes[i-1] != '\\') {
Akroncc25e932025-06-02 19:39:43 +0200156 // Check if this parenthesis is inside brackets (part of an identifier)
157 insideBrackets := false
158 bracketDepth := 0
159 for j := 0; j < i; j++ {
160 if runes[j] == '[' {
161 bracketDepth++
162 } else if runes[j] == ']' {
163 bracketDepth--
164 }
165 }
166 insideBrackets = bracketDepth > 0
167
168 if !insideBrackets {
169 result = append(result, ' ', r, ' ')
170 } else {
171 result = append(result, r)
172 }
Akron76b87972025-06-02 16:59:59 +0200173 } else {
174 result = append(result, r)
175 }
176 }
Akron6b4c9eb2025-07-03 14:31:58 +0200177 return strings.TrimSpace(string(result))
178}
Akronbb5065f2025-05-21 12:44:05 +0200179
Akron6b4c9eb2025-07-03 14:31:58 +0200180// ParseMapping parses a mapping rule string into a MappingResult
181func (p *GrammarParser) ParseMapping(input string) (*MappingResult, error) {
182 input = p.preprocessInput(input)
Akronbb5065f2025-05-21 12:44:05 +0200183
184 grammar, err := p.mappingParser.ParseString("", input)
185 if err != nil {
186 return nil, fmt.Errorf("failed to parse grammar: %w", err)
187 }
188
189 if grammar.Mapping == nil {
190 return nil, fmt.Errorf("expected mapping rule, got token expression")
191 }
192
193 upper, err := p.parseExpr(grammar.Mapping.Upper.Expr)
194 if err != nil {
195 return nil, err
196 }
197
198 lower, err := p.parseExpr(grammar.Mapping.Lower.Expr)
199 if err != nil {
200 return nil, err
201 }
202
203 return &MappingResult{
204 Upper: &ast.Token{Wrap: upper},
205 Lower: &ast.Token{Wrap: lower},
206 }, nil
207}
208
209// MappingResult represents the parsed mapping rule
210type MappingResult struct {
211 Upper *ast.Token
212 Lower *ast.Token
213}
214
Akron22322ec2025-05-21 11:17:30 +0200215// parseExpr builds the AST from the parsed Expr
216func (p *GrammarParser) parseExpr(expr *Expr) (ast.Node, error) {
217 var operands []ast.Node
218 var operators []string
219
220 // Parse the first term
221 first, err := p.parseTerm(expr.First)
222 if err != nil {
223 return nil, err
224 }
225 operands = append(operands, first)
226
227 // Parse the rest
228 for _, op := range expr.Rest {
229 node, err := p.parseTerm(op.Term)
230 if err != nil {
231 return nil, err
232 }
233 operands = append(operands, node)
234 operators = append(operators, op.Operator)
235 }
236
237 // If only one operand, return it
238 if len(operands) == 1 {
239 return operands[0], nil
240 }
241
242 // Group operands by operator precedence (left-to-right, no precedence between & and |)
243 // We'll group by runs of the same operator
244 var groupOperands []ast.Node
245 var currentOp string
246 var currentGroup []ast.Node
247 for i, op := range operators {
248 if i == 0 {
249 currentOp = op
250 currentGroup = append(currentGroup, operands[i])
251 }
252 if op == currentOp {
253 currentGroup = append(currentGroup, operands[i+1])
254 } else {
255 groupOperands = append(groupOperands, &ast.TermGroup{
256 Operands: append([]ast.Node{}, currentGroup...),
257 Relation: toRelation(currentOp),
258 })
259 currentOp = op
260 currentGroup = []ast.Node{operands[i+1]}
261 }
262 }
263 if len(currentGroup) > 0 {
264 groupOperands = append(groupOperands, &ast.TermGroup{
265 Operands: append([]ast.Node{}, currentGroup...),
266 Relation: toRelation(currentOp),
267 })
268 }
269 if len(groupOperands) == 1 {
270 return groupOperands[0], nil
271 }
272 // If mixed operators, nest them left-to-right
273 result := groupOperands[0]
274 for i := 1; i < len(groupOperands); i++ {
275 result = &ast.TermGroup{
276 Operands: []ast.Node{result, groupOperands[i]},
277 Relation: toRelation(operators[0]),
278 }
279 }
280 return result, nil
281}
282
283// parseTerm converts a Term into an AST node
284func (p *GrammarParser) parseTerm(term *Term) (ast.Node, error) {
285 if term.Simple != nil {
286 return p.parseSimpleTerm(term.Simple)
287 }
288 if term.Paren != nil {
289 return p.parseExpr(term.Paren.Expr)
290 }
291 return nil, fmt.Errorf("invalid term: neither simple nor parenthesized")
292}
293
294func toRelation(op string) ast.RelationType {
295 if op == "|" {
296 return ast.OrRelation
297 }
298 return ast.AndRelation
299}
300
// unescapeString removes backslash escapes from s: every backslash that is
// followed by another byte is dropped and that following byte is kept
// verbatim (so `\(` becomes `(` and `\\` becomes `\`). A lone backslash at
// the very end of s has nothing to escape and is preserved.
//
// Strings that contain nothing to unescape are returned as-is without
// allocating.
func unescapeString(s string) string {
	first := strings.IndexByte(s, '\\')
	if first < 0 || first == len(s)-1 {
		// No backslash at all, or only a trailing one: nothing changes.
		return s
	}

	var out strings.Builder
	out.Grow(len(s) - 1) // at least one backslash will be removed
	out.WriteString(s[:first])
	for i := first; i < len(s); i++ {
		if s[i] == '\\' && i+1 < len(s) {
			// Skip the backslash; the escaped byte is written below.
			i++
		}
		out.WriteByte(s[i])
	}
	return out.String()
}
323
Akron22322ec2025-05-21 11:17:30 +0200324// parseSimpleTerm converts a SimpleTerm into an AST Term node
325func (p *GrammarParser) parseSimpleTerm(term *SimpleTerm) (ast.Node, error) {
326 var foundry, layer, key, value string
327
328 switch {
329 case term.WithFoundryLayer != nil:
Akron121c66e2025-06-02 16:34:05 +0200330 foundry = unescapeString(term.WithFoundryLayer.Foundry)
331 layer = unescapeString(term.WithFoundryLayer.Layer)
332 key = unescapeString(term.WithFoundryLayer.Key)
333 value = unescapeString(term.WithFoundryLayer.Value)
Akroncc25e932025-06-02 19:39:43 +0200334 case term.WithFoundryWildcard != nil:
335 foundry = unescapeString(term.WithFoundryWildcard.Foundry)
336 key = unescapeString(term.WithFoundryWildcard.Key)
Akron22322ec2025-05-21 11:17:30 +0200337 case term.WithFoundryKey != nil:
Akron121c66e2025-06-02 16:34:05 +0200338 foundry = unescapeString(term.WithFoundryKey.Foundry)
339 key = unescapeString(term.WithFoundryKey.Key)
Akron22322ec2025-05-21 11:17:30 +0200340 case term.WithLayer != nil:
Akroncc25e932025-06-02 19:39:43 +0200341 // Special case: if LayerTerm was parsed but the layer doesn't match the default layer,
342 // treat it as a key=value pattern instead
343 parsedLayer := unescapeString(term.WithLayer.Layer)
344 parsedKey := unescapeString(term.WithLayer.Key)
345 parsedValue := unescapeString(term.WithLayer.Value)
346
347 if p.defaultLayer != "" && parsedLayer == p.defaultLayer {
348 // This is a genuine layer=key pattern when the layer matches the default
349 layer = parsedLayer
350 key = parsedKey
351 value = parsedValue
352 } else if p.defaultLayer != "" && parsedLayer != p.defaultLayer {
353 // This should be treated as key=value pattern when there's a default layer but it doesn't match
354 key = parsedLayer
355 value = parsedKey
356 } else {
357 // No default layer context, treat as genuine layer=key pattern
358 layer = parsedLayer
359 key = parsedKey
360 value = parsedValue
361 }
Akron22322ec2025-05-21 11:17:30 +0200362 case term.SimpleKey != nil:
Akron121c66e2025-06-02 16:34:05 +0200363 key = unescapeString(term.SimpleKey.Key)
364 value = unescapeString(term.SimpleKey.Value)
Akron22322ec2025-05-21 11:17:30 +0200365 default:
366 return nil, fmt.Errorf("invalid term: no valid form found")
367 }
368
369 if foundry == "" {
370 foundry = p.defaultFoundry
371 }
372 if layer == "" {
373 layer = p.defaultLayer
374 }
375
376 return &ast.Term{
377 Foundry: foundry,
378 Key: key,
379 Layer: layer,
380 Match: ast.MatchEqual,
381 Value: value,
382 }, nil
383}