Add title attribute parser for HTML span annotations

Change-Id: I14d89e20301ad30857774a06cb1cd7c5e466dab5
diff --git a/parser/title_parser.go b/parser/title_parser.go
new file mode 100644
index 0000000..b6c13bf
--- /dev/null
+++ b/parser/title_parser.go
@@ -0,0 +1,102 @@
+package parser
+
+import (
+	"fmt"
+	"regexp"
+
+	"github.com/KorAP/KoralPipe-TermMapper/ast"
+)
+
+// TitleAttribute represents a parsed title attribute from an HTML span
+type TitleAttribute struct {
+	Foundry string
+	Layer   string
+	Key     string
+	Value   string
+}
+
+// TitleAttributeParser parses title attributes from HTML span elements
+type TitleAttributeParser struct {
+	regex *regexp.Regexp
+}
+
+// NewTitleAttributeParser creates a new title attribute parser
+func NewTitleAttributeParser() *TitleAttributeParser {
+	// Single regex that captures: foundry/layer:key or foundry/layer:key[:=]value
+	// Groups: 1=foundry, 2=layer, 3=key, 4=value (optional)
+	regex := regexp.MustCompile(`^([^/]+)/([^:]+):([^:=]+)(?:[:=](.+))?$`)
+	return &TitleAttributeParser{
+		regex: regex,
+	}
+}
+
+// ParseTitleAttribute parses a single title attribute string
+// Expects format: "foundry/layer:key" or "foundry/layer:key[:=]value"
+func (p *TitleAttributeParser) ParseTitleAttribute(title string) (*TitleAttribute, error) {
+	if title == "" {
+		return nil, fmt.Errorf("empty title attribute")
+	}
+
+	matches := p.regex.FindStringSubmatch(title)
+	if matches == nil {
+		return nil, fmt.Errorf("invalid title format: '%s'", title)
+	}
+
+	foundry := matches[1]
+	layer := matches[2]
+	key := matches[3]
+	value := ""
+	if len(matches) > 4 && matches[4] != "" {
+		value = matches[4]
+	}
+
+	return &TitleAttribute{
+		Foundry: foundry,
+		Layer:   layer,
+		Key:     key,
+		Value:   value,
+	}, nil
+}
+
+// ParseTitleAttributesToTerms converts title attributes to AST Term nodes
+func (p *TitleAttributeParser) ParseTitleAttributesToTerms(titles []string) ([]ast.Node, error) {
+	terms := make([]ast.Node, 0) // Initialize as empty slice instead of nil
+
+	for _, title := range titles {
+		attr, err := p.ParseTitleAttribute(title)
+		if err != nil {
+			return nil, fmt.Errorf("failed to parse title '%s': %w", title, err)
+		}
+
+		term := &ast.Term{
+			Foundry: attr.Foundry,
+			Layer:   attr.Layer,
+			Key:     attr.Key,
+			Value:   attr.Value,
+			Match:   ast.MatchEqual,
+		}
+
+		terms = append(terms, term)
+	}
+
+	return terms, nil
+}
+
+// ToAST converts a TitleAttribute to an AST Term node
+func (attr *TitleAttribute) ToAST() ast.Node {
+	return &ast.Term{
+		Foundry: attr.Foundry,
+		Layer:   attr.Layer,
+		Key:     attr.Key,
+		Value:   attr.Value,
+		Match:   ast.MatchEqual,
+	}
+}
+
+// String returns a string representation of the title attribute
+func (attr *TitleAttribute) String() string {
+	if attr.Value != "" {
+		return fmt.Sprintf("%s/%s:%s=%s", attr.Foundry, attr.Layer, attr.Key, attr.Value)
+	}
+	return fmt.Sprintf("%s/%s:%s", attr.Foundry, attr.Layer, attr.Key)
+}
diff --git a/parser/title_parser_test.go b/parser/title_parser_test.go
new file mode 100644
index 0000000..3c82a59
--- /dev/null
+++ b/parser/title_parser_test.go
@@ -0,0 +1,331 @@
+package parser
+
+import (
+	"testing"
+
+	"github.com/KorAP/KoralPipe-TermMapper/ast"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+func TestTitleAttributeParser_ParseTitleAttribute(t *testing.T) {
+	parser := NewTitleAttributeParser()
+
+	tests := []struct {
+		name     string
+		input    string
+		expected *TitleAttribute
+		wantErr  bool
+	}{
+		{
+			name:  "Parse simple title with key only",
+			input: "corenlp/p:ART",
+			expected: &TitleAttribute{
+				Foundry: "corenlp",
+				Layer:   "p",
+				Key:     "ART",
+				Value:   "",
+			},
+			wantErr: false,
+		},
+		{
+			name:  "Parse title with key and value",
+			input: "marmot/m:case=nom",
+			expected: &TitleAttribute{
+				Foundry: "marmot",
+				Layer:   "m",
+				Key:     "case",
+				Value:   "nom",
+			},
+			wantErr: false,
+		},
+		{
+			name:  "Parse title with colon separator for value",
+			input: "marmot/m:gender:masc",
+			expected: &TitleAttribute{
+				Foundry: "marmot",
+				Layer:   "m",
+				Key:     "gender",
+				Value:   "masc",
+			},
+			wantErr: false,
+		},
+		{
+			name:  "Parse title with equals separator for value",
+			input: "marmot/m:degree=pos",
+			expected: &TitleAttribute{
+				Foundry: "marmot",
+				Layer:   "m",
+				Key:     "degree",
+				Value:   "pos",
+			},
+			wantErr: false,
+		},
+		{
+			name:  "Parse title with lemma layer",
+			input: "tt/l:die",
+			expected: &TitleAttribute{
+				Foundry: "tt",
+				Layer:   "l",
+				Key:     "die",
+				Value:   "",
+			},
+			wantErr: false,
+		},
+		{
+			name:  "Parse title with special characters in value",
+			input: "tt/l:@card@",
+			expected: &TitleAttribute{
+				Foundry: "tt",
+				Layer:   "l",
+				Key:     "@card@",
+				Value:   "",
+			},
+			wantErr: false,
+		},
+		{
+			name:    "Empty title should fail",
+			input:   "",
+			wantErr: true,
+		},
+		{
+			name:    "Missing foundry separator should fail",
+			input:   "corenlp_p:ART",
+			wantErr: true,
+		},
+		{
+			name:    "Missing layer separator should fail",
+			input:   "corenlp/p_ART",
+			wantErr: true,
+		},
+		{
+			name:    "Only foundry should fail",
+			input:   "corenlp",
+			wantErr: true,
+		},
+		{
+			name:    "Only foundry and layer should fail",
+			input:   "corenlp/p",
+			wantErr: true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result, err := parser.ParseTitleAttribute(tt.input)
+
+			if tt.wantErr {
+				assert.Error(t, err)
+				assert.Nil(t, result)
+			} else {
+				require.NoError(t, err)
+				require.NotNil(t, result)
+				assert.Equal(t, tt.expected.Foundry, result.Foundry)
+				assert.Equal(t, tt.expected.Layer, result.Layer)
+				assert.Equal(t, tt.expected.Key, result.Key)
+				assert.Equal(t, tt.expected.Value, result.Value)
+			}
+		})
+	}
+}
+
+func TestTitleAttributeParser_ParseTitleAttributesToTerms(t *testing.T) {
+	parser := NewTitleAttributeParser()
+
+	tests := []struct {
+		name     string
+		input    []string
+		expected []ast.Node
+		wantErr  bool
+	}{
+		{
+			name:  "Parse multiple title attributes",
+			input: []string{"corenlp/p:ART", "marmot/m:case=nom", "tt/l:die"},
+			expected: []ast.Node{
+				&ast.Term{
+					Foundry: "corenlp",
+					Layer:   "p",
+					Key:     "ART",
+					Value:   "",
+					Match:   ast.MatchEqual,
+				},
+				&ast.Term{
+					Foundry: "marmot",
+					Layer:   "m",
+					Key:     "case",
+					Value:   "nom",
+					Match:   ast.MatchEqual,
+				},
+				&ast.Term{
+					Foundry: "tt",
+					Layer:   "l",
+					Key:     "die",
+					Value:   "",
+					Match:   ast.MatchEqual,
+				},
+			},
+			wantErr: false,
+		},
+		{
+			name:     "Empty input should return empty slice",
+			input:    []string{},
+			expected: []ast.Node{},
+			wantErr:  false,
+		},
+		{
+			name:    "Invalid title should cause error",
+			input:   []string{"corenlp/p:ART", "invalid_title", "tt/l:die"},
+			wantErr: true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result, err := parser.ParseTitleAttributesToTerms(tt.input)
+
+			if tt.wantErr {
+				assert.Error(t, err)
+			} else {
+				require.NoError(t, err)
+				require.Len(t, result, len(tt.expected))
+
+				for i, expectedTerm := range tt.expected {
+					expectedTermNode := expectedTerm.(*ast.Term)
+					actualTermNode := result[i].(*ast.Term)
+
+					assert.Equal(t, expectedTermNode.Foundry, actualTermNode.Foundry)
+					assert.Equal(t, expectedTermNode.Layer, actualTermNode.Layer)
+					assert.Equal(t, expectedTermNode.Key, actualTermNode.Key)
+					assert.Equal(t, expectedTermNode.Value, actualTermNode.Value)
+					assert.Equal(t, expectedTermNode.Match, actualTermNode.Match)
+				}
+			}
+		})
+	}
+}
+
+func TestTitleAttribute_ToAST(t *testing.T) {
+	tests := []struct {
+		name     string
+		attr     *TitleAttribute
+		expected *ast.Term
+	}{
+		{
+			name: "Convert title attribute to AST term",
+			attr: &TitleAttribute{
+				Foundry: "corenlp",
+				Layer:   "p",
+				Key:     "ART",
+				Value:   "",
+			},
+			expected: &ast.Term{
+				Foundry: "corenlp",
+				Layer:   "p",
+				Key:     "ART",
+				Value:   "",
+				Match:   ast.MatchEqual,
+			},
+		},
+		{
+			name: "Convert title attribute with value to AST term",
+			attr: &TitleAttribute{
+				Foundry: "marmot",
+				Layer:   "m",
+				Key:     "case",
+				Value:   "nom",
+			},
+			expected: &ast.Term{
+				Foundry: "marmot",
+				Layer:   "m",
+				Key:     "case",
+				Value:   "nom",
+				Match:   ast.MatchEqual,
+			},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result := tt.attr.ToAST()
+
+			termResult := result.(*ast.Term)
+			assert.Equal(t, tt.expected.Foundry, termResult.Foundry)
+			assert.Equal(t, tt.expected.Layer, termResult.Layer)
+			assert.Equal(t, tt.expected.Key, termResult.Key)
+			assert.Equal(t, tt.expected.Value, termResult.Value)
+			assert.Equal(t, tt.expected.Match, termResult.Match)
+		})
+	}
+}
+
+func TestTitleAttribute_String(t *testing.T) {
+	tests := []struct {
+		name     string
+		attr     *TitleAttribute
+		expected string
+	}{
+		{
+			name: "String representation without value",
+			attr: &TitleAttribute{
+				Foundry: "corenlp",
+				Layer:   "p",
+				Key:     "ART",
+				Value:   "",
+			},
+			expected: "corenlp/p:ART",
+		},
+		{
+			name: "String representation with value",
+			attr: &TitleAttribute{
+				Foundry: "marmot",
+				Layer:   "m",
+				Key:     "case",
+				Value:   "nom",
+			},
+			expected: "marmot/m:case=nom",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result := tt.attr.String()
+			assert.Equal(t, tt.expected, result)
+		})
+	}
+}
+
+func TestTitleAttributeParser_RealWorldExample(t *testing.T) {
+	parser := NewTitleAttributeParser()
+
+	// Example titles from the response test file
+	titles := []string{
+		"corenlp/p:ART",
+		"marmot/m:case=nom",
+		"marmot/m:gender=masc",
+		"marmot/m:number=sg",
+		"marmot/p:ART",
+		"opennlp/p:ART",
+		"tt/l:die",
+		"tt/p:ART",
+	}
+
+	// Parse each title attribute
+	for _, title := range titles {
+		attr, err := parser.ParseTitleAttribute(title)
+		require.NoError(t, err)
+		require.NotNil(t, attr)
+
+		// Verify the string representation matches
+		assert.Equal(t, title, attr.String())
+
+		// Verify conversion to AST works
+		astNode := attr.ToAST()
+		require.NotNil(t, astNode)
+
+		term := astNode.(*ast.Term)
+		assert.NotEmpty(t, term.Foundry)
+		assert.NotEmpty(t, term.Layer)
+		assert.NotEmpty(t, term.Key)
+		assert.Equal(t, ast.MatchEqual, term.Match)
+	}
+}