blob: 053ba5980c74ff80a70a834194506ddd6a8500c8 [file] [log] [blame]
package parser
import (
"fmt"
"regexp"
"github.com/KorAP/Koral-Mapper/ast"
)
// TitleAttribute holds the components extracted from one title attribute
// of an HTML span element.
type TitleAttribute struct {
	// Foundry and Layer are the two parts of the "foundry/layer" prefix.
	Foundry, Layer string
	// Key is the annotation key; Value is the optional value and stays
	// empty when the title carries none.
	Key, Value string
}
// TitleAttributeParser turns title attribute strings taken from HTML span
// elements into their foundry, layer, key and value components.
type TitleAttributeParser struct {
	regex *regexp.Regexp // compiled title pattern, set by NewTitleAttributeParser
}

// NewTitleAttributeParser returns a parser whose title pattern is already
// compiled.
//
// The pattern matches "foundry/layer:key" and "foundry/layer:key:value".
// Capture groups: 1=foundry, 2=layer, 3=key, 4=value (optional). Only ':'
// separates key and value; any other character, '=' included, stays part
// of the key.
func NewTitleAttributeParser() *TitleAttributeParser {
	parser := new(TitleAttributeParser)
	parser.regex = regexp.MustCompile(`^([^/]+)/([^:]+):([^:]+)(?::(.+))?$`)
	return parser
}
// parseTitleAttribute splits one title string into its components using the
// parser's compiled pattern.
//
// Accepted shapes are "foundry/layer:key" and "foundry/layer:key:value". An
// error is returned for an empty title or for one the pattern does not match.
func (p *TitleAttributeParser) parseTitleAttribute(title string) (*TitleAttribute, error) {
	if len(title) == 0 {
		return nil, fmt.Errorf("empty title attribute")
	}

	groups := p.regex.FindStringSubmatch(title)
	if groups == nil {
		return nil, fmt.Errorf("invalid title format: '%s'", title)
	}

	// A successful match always yields one entry per capture group; the
	// optional value group (index 4) is "" when the title has no value part.
	return &TitleAttribute{
		Foundry: groups[1],
		Layer:   groups[2],
		Key:     groups[3],
		Value:   groups[4],
	}, nil
}
// ParseTitleAttributesToTerms converts title attribute strings into AST term
// nodes.
//
// Each title is parsed with parseTitleAttribute and becomes an *ast.Term
// carrying the parsed foundry, layer, key and value, with Match set to
// ast.MatchEqual. Terms are returned in input order. Processing stops at the
// first title that fails to parse; the result is then nil and the returned
// error wraps the parse failure. When titles is empty the result is an
// empty, non-nil slice.
func (p *TitleAttributeParser) ParseTitleAttributesToTerms(titles []string) ([]ast.Node, error) {
	// Exactly one term is produced per title, so reserve the capacity up
	// front. make (rather than a nil slice) keeps the result non-nil even
	// when there are no titles.
	terms := make([]ast.Node, 0, len(titles))
	for _, title := range titles {
		attr, err := p.parseTitleAttribute(title)
		if err != nil {
			return nil, fmt.Errorf("failed to parse title '%s': %w", title, err)
		}
		terms = append(terms, &ast.Term{
			Foundry: attr.Foundry,
			Layer:   attr.Layer,
			Key:     attr.Key,
			Value:   attr.Value,
			Match:   ast.MatchEqual,
		})
	}
	return terms, nil
}