Add title attribute parser for HTML span annotations
Change-Id: I14d89e20301ad30857774a06cb1cd7c5e466dab5
diff --git a/parser/title_parser.go b/parser/title_parser.go
new file mode 100644
index 0000000..b6c13bf
--- /dev/null
+++ b/parser/title_parser.go
@@ -0,0 +1,102 @@
+package parser
+
+import (
+ "fmt"
+ "regexp"
+
+ "github.com/KorAP/KoralPipe-TermMapper/ast"
+)
+
+// TitleAttribute holds the parts of a span title such as "foundry/layer:key" or "foundry/layer:key=value".
+type TitleAttribute struct {
+ Foundry string // text before the first "/" of the title
+ Layer string // text between that "/" and the following ":"
+ Key string // text after that ":" up to an optional ":" or "=" separator
+ Value string // text after the separator; empty when the title carries no value
+}
+
+// TitleAttributeParser parses title attributes from HTML span elements using a compiled pattern.
+type TitleAttributeParser struct {
+ regex *regexp.Regexp // pattern applied to each title; populated by NewTitleAttributeParser
+}
+
+// NewTitleAttributeParser creates a new title attribute parser
+func NewTitleAttributeParser() *TitleAttributeParser {
+ // Single regex that captures: foundry/layer:key or foundry/layer:key[:=]value
+ // Groups: 1=foundry, 2=layer, 3=key, 4=value (optional)
+ regex := regexp.MustCompile(`^([^/]+)/([^:]+):([^:=]+)(?:[:=](.+))?$`)
+ return &TitleAttributeParser{
+ regex: regex,
+ }
+}
+
+// ParseTitleAttribute parses a single title attribute string of the form
+// "foundry/layer:key" or "foundry/layer:key[:=]value".
+// It returns an error when title is empty, when it does not match that format,
+// or when the parser holds no compiled pattern (nil or zero-value parser).
+func (p *TitleAttributeParser) ParseTitleAttribute(title string) (*TitleAttribute, error) {
+	if title == "" {
+		return nil, fmt.Errorf("empty title attribute")
+	}
+	// Without a compiled pattern the match call below would panic; report it instead.
+	if p == nil || p.regex == nil {
+		return nil, fmt.Errorf("title attribute parser is not initialized")
+	}
+
+	matches := p.regex.FindStringSubmatch(title)
+	if matches == nil {
+		return nil, fmt.Errorf("invalid title format: '%s'", title)
+	}
+
+	// FindStringSubmatch returns one entry per capture group and leaves an unmatched
+	// optional group as "", so the value group can be read without extra checks.
+	return &TitleAttribute{
+		Foundry: matches[1],
+		Layer:   matches[2],
+		Key:     matches[3],
+		Value:   matches[4],
+	}, nil
+}
+
+// ParseTitleAttributesToTerms parses every entry of titles and converts each
+// one to an AST term via TitleAttribute.ToAST, preserving the input order.
+//
+// On success the returned slice is never nil, even when titles is nil or
+// empty, so it stays distinguishable from the nil slice returned on failure.
+// Parsing stops at the first title that fails to parse; the returned error
+// names that title and wraps the underlying parse error.
+func (p *TitleAttributeParser) ParseTitleAttributesToTerms(titles []string) ([]ast.Node, error) {
+	// Exactly one term is produced per title, so the capacity is known up front.
+	terms := make([]ast.Node, 0, len(titles))
+
+	for _, title := range titles {
+		attr, err := p.ParseTitleAttribute(title)
+		if err != nil {
+			return nil, fmt.Errorf("failed to parse title '%s': %w", title, err)
+		}
+
+		// Delegate to ToAST so the attribute-to-term mapping lives in one place.
+		terms = append(terms, attr.ToAST())
+	}
+
+	return terms, nil
+}
+
+// ToAST builds an ast.Term carrying this attribute's foundry, layer, key and value with match type ast.MatchEqual.
+func (attr *TitleAttribute) ToAST() ast.Node {
+	term := new(ast.Term)
+	term.Foundry = attr.Foundry
+	term.Layer = attr.Layer
+	term.Key = attr.Key
+	term.Value = attr.Value
+	term.Match = ast.MatchEqual
+	return term
+}
+
+// String renders the attribute as "foundry/layer:key", appending "=value" only when Value is non-empty.
+func (attr *TitleAttribute) String() string {
+	if attr.Value == "" {
+		return attr.Foundry + "/" + attr.Layer + ":" + attr.Key
+	}
+	return attr.Foundry + "/" + attr.Layer + ":" + attr.Key + "=" + attr.Value
+}