blob: 87bd809d1fe8147157e06aa3cd5caf646b56ee40 [file] [log] [blame]
Akrone562ad62025-06-25 11:15:56 +02001package parser
2
3import (
4 "fmt"
5 "regexp"
6
7 "github.com/KorAP/KoralPipe-TermMapper/ast"
8)
9
// TitleAttribute holds the components of one parsed title attribute taken
// from an HTML span, written as "foundry/layer:key" or
// "foundry/layer:key:value".
type TitleAttribute struct {
	// Foundry is the text before the first "/"; Layer is the text between
	// that "/" and the following ":".
	Foundry, Layer string
	// Key follows the layer; Value is the optional remainder after the key
	// and stays empty when the title has no value part.
	Key, Value string
}
17
// TitleAttributeParser converts title attribute strings found on HTML span
// elements into structured values. It carries the compiled pattern that the
// parsing methods match each title against.
type TitleAttributeParser struct {
	// regex is the compiled title pattern used for matching.
	regex *regexp.Regexp
}
22
23// NewTitleAttributeParser creates a new title attribute parser
24func NewTitleAttributeParser() *TitleAttributeParser {
25 // Single regex that captures: foundry/layer:key or foundry/layer:key[:=]value
26 // Groups: 1=foundry, 2=layer, 3=key, 4=value (optional)
Akron7de5b612025-06-26 16:15:03 +020027 regex := regexp.MustCompile(`^([^/]+)/([^:]+):([^:]+)(?::(.+))?$`)
Akrone562ad62025-06-25 11:15:56 +020028 return &TitleAttributeParser{
29 regex: regex,
30 }
31}
32
Akron6b4c9eb2025-07-03 14:31:58 +020033// parseTitleAttribute parses a single title attribute string
Akrone562ad62025-06-25 11:15:56 +020034// Expects format: "foundry/layer:key" or "foundry/layer:key[:=]value"
Akron6b4c9eb2025-07-03 14:31:58 +020035func (p *TitleAttributeParser) parseTitleAttribute(title string) (*TitleAttribute, error) {
Akrone562ad62025-06-25 11:15:56 +020036 if title == "" {
37 return nil, fmt.Errorf("empty title attribute")
38 }
39
40 matches := p.regex.FindStringSubmatch(title)
41 if matches == nil {
42 return nil, fmt.Errorf("invalid title format: '%s'", title)
43 }
44
45 foundry := matches[1]
46 layer := matches[2]
47 key := matches[3]
48 value := ""
49 if len(matches) > 4 && matches[4] != "" {
50 value = matches[4]
51 }
52
53 return &TitleAttribute{
54 Foundry: foundry,
55 Layer: layer,
56 Key: key,
57 Value: value,
58 }, nil
59}
60
61// ParseTitleAttributesToTerms converts title attributes to AST Term nodes
62func (p *TitleAttributeParser) ParseTitleAttributesToTerms(titles []string) ([]ast.Node, error) {
63 terms := make([]ast.Node, 0) // Initialize as empty slice instead of nil
64
65 for _, title := range titles {
Akron6b4c9eb2025-07-03 14:31:58 +020066 attr, err := p.parseTitleAttribute(title)
Akrone562ad62025-06-25 11:15:56 +020067 if err != nil {
68 return nil, fmt.Errorf("failed to parse title '%s': %w", title, err)
69 }
70
71 term := &ast.Term{
72 Foundry: attr.Foundry,
73 Layer: attr.Layer,
74 Key: attr.Key,
75 Value: attr.Value,
76 Match: ast.MatchEqual,
77 }
78
79 terms = append(terms, term)
80 }
81
82 return terms, nil
83}