Akron | e562ad6 | 2025-06-25 11:15:56 +0200 | [diff] [blame] | 1 | package parser |
| 2 | |
| 3 | import ( |
| 4 | "fmt" |
| 5 | "regexp" |
| 6 | |
| 7 | "github.com/KorAP/KoralPipe-TermMapper/ast" |
| 8 | ) |
| 9 | |
// TitleAttribute holds the components extracted from the title attribute
// of an HTML span.
type TitleAttribute struct {
	// Foundry, Layer and Key are the leading components of a title
	// written as "foundry/layer:key".
	Foundry, Layer, Key string
	// Value is the optional trailing component; it is empty when the
	// title carries no value.
	Value string
}
| 17 | |
// TitleAttributeParser parses the title attributes of HTML span elements.
// It carries the compiled pattern that splits a title into its components.
type TitleAttributeParser struct {
	regex *regexp.Regexp
}

// NewTitleAttributeParser returns a parser with its title pattern compiled.
//
// The pattern accepts "foundry/layer:key" and "foundry/layer:key:value".
// Capture groups: 1=foundry, 2=layer, 3=key, 4=value (optional).
// Only ':' separates the key from the value; an '=' after the key is not a
// separator and is captured as part of the key.
func NewTitleAttributeParser() *TitleAttributeParser {
	parser := new(TitleAttributeParser)
	parser.regex = regexp.MustCompile(`^([^/]+)/([^:]+):([^:]+)(?::(.+))?$`)
	return parser
}
| 32 | |
Akron | 6b4c9eb | 2025-07-03 14:31:58 +0200 | [diff] [blame] | 33 | // parseTitleAttribute parses a single title attribute string |
Akron | e562ad6 | 2025-06-25 11:15:56 +0200 | [diff] [blame] | 34 | // Expects format: "foundry/layer:key" or "foundry/layer:key[:=]value" |
Akron | 6b4c9eb | 2025-07-03 14:31:58 +0200 | [diff] [blame] | 35 | func (p *TitleAttributeParser) parseTitleAttribute(title string) (*TitleAttribute, error) { |
Akron | e562ad6 | 2025-06-25 11:15:56 +0200 | [diff] [blame] | 36 | if title == "" { |
| 37 | return nil, fmt.Errorf("empty title attribute") |
| 38 | } |
| 39 | |
| 40 | matches := p.regex.FindStringSubmatch(title) |
| 41 | if matches == nil { |
| 42 | return nil, fmt.Errorf("invalid title format: '%s'", title) |
| 43 | } |
| 44 | |
| 45 | foundry := matches[1] |
| 46 | layer := matches[2] |
| 47 | key := matches[3] |
| 48 | value := "" |
| 49 | if len(matches) > 4 && matches[4] != "" { |
| 50 | value = matches[4] |
| 51 | } |
| 52 | |
| 53 | return &TitleAttribute{ |
| 54 | Foundry: foundry, |
| 55 | Layer: layer, |
| 56 | Key: key, |
| 57 | Value: value, |
| 58 | }, nil |
| 59 | } |
| 60 | |
| 61 | // ParseTitleAttributesToTerms converts title attributes to AST Term nodes |
| 62 | func (p *TitleAttributeParser) ParseTitleAttributesToTerms(titles []string) ([]ast.Node, error) { |
| 63 | terms := make([]ast.Node, 0) // Initialize as empty slice instead of nil |
| 64 | |
| 65 | for _, title := range titles { |
Akron | 6b4c9eb | 2025-07-03 14:31:58 +0200 | [diff] [blame] | 66 | attr, err := p.parseTitleAttribute(title) |
Akron | e562ad6 | 2025-06-25 11:15:56 +0200 | [diff] [blame] | 67 | if err != nil { |
| 68 | return nil, fmt.Errorf("failed to parse title '%s': %w", title, err) |
| 69 | } |
| 70 | |
| 71 | term := &ast.Term{ |
| 72 | Foundry: attr.Foundry, |
| 73 | Layer: attr.Layer, |
| 74 | Key: attr.Key, |
| 75 | Value: attr.Value, |
| 76 | Match: ast.MatchEqual, |
| 77 | } |
| 78 | |
| 79 | terms = append(terms, term) |
| 80 | } |
| 81 | |
| 82 | return terms, nil |
| 83 | } |