Implemented snippet_matcher to parse snippets and match against mapping lists

Change-Id: I79c7736dbfcc9f6ca61486e9211647974f011bd3
diff --git a/matcher/snippet_matcher_test.go b/matcher/snippet_matcher_test.go
new file mode 100644
index 0000000..f740d65
--- /dev/null
+++ b/matcher/snippet_matcher_test.go
@@ -0,0 +1,472 @@
+package matcher
+
+import (
+	"testing"
+
+	"github.com/KorAP/KoralPipe-TermMapper/ast"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// TestSnippetMatcher_ParseSnippet feeds nested <span title="..."> snippets to
+// ParseSnippet and checks the number of tokens and each token's text in order.
+func TestSnippetMatcher_ParseSnippet(t *testing.T) {
+	// Pattern/replacement pair used to construct the matcher under test.
+	pattern := ast.Pattern{
+		Root: &ast.Term{
+			Foundry: "marmot",
+			Layer:   "m",
+			Key:     "gender",
+			Value:   "masc",
+			Match:   ast.MatchEqual,
+		},
+	}
+
+	replacement := ast.Replacement{
+		Root: &ast.Term{
+			Foundry: "opennlp",
+			Layer:   "m",
+			Key:     "M",
+			Value:   "",
+			Match:   ast.MatchEqual,
+		},
+	}
+
+	sm, err := NewSnippetMatcher(pattern, replacement)
+	require.NoError(t, err)
+
+	cases := []struct {
+		name             string
+		snippet          string
+		expectedTokens   int
+		expectedContains []string
+	}{
+		{
+			name: "Simple single token",
+			snippet: `<span title="corenlp/p:ART">
+				<span title="marmot/m:case:nom">
+				<span title="marmot/m:gender:masc">
+				<span title="marmot/m:number:sg">
+				<span title="marmot/p:ART">
+				Der</span>
+				</span>
+				</span>
+				</span>
+				</span>`,
+			expectedTokens:   1,
+			expectedContains: []string{"Der"},
+		},
+		{
+			name: "Multiple tokens",
+			snippet: `<span title="corenlp/p:ART">
+				<span title="marmot/m:case:nom">
+				<span title="marmot/m:gender:masc">
+				Der</span>
+				</span>
+				</span> 
+				<span title="corenlp/p:ADJA">
+				<span title="marmot/m:case:nom">
+				<span title="marmot/m:gender:masc">
+				alte</span>
+				</span>
+				</span>`,
+			expectedTokens:   2,
+			expectedContains: []string{"Der", "alte"},
+		},
+		{
+			name: "Real-world example from test",
+			snippet: `<span title="corenlp/p:ART">
+				<span title="marmot/m:case:nom">
+				<span title="marmot/m:gender:masc">
+				<span title="marmot/m:number:sg">
+				<span title="marmot/p:ART">
+				<span title="opennlp/p:ART">
+				<span title="tt/l:die">
+				<span title="tt/p:ART">Der</span>
+				</span>
+				</span>
+				</span>
+				</span>
+				</span>
+				</span>
+				</span>`,
+			expectedTokens:   1,
+			expectedContains: []string{"Der"},
+		},
+		{
+			name:             "Empty snippet",
+			snippet:          "",
+			expectedTokens:   0,
+			expectedContains: []string{},
+		},
+		{
+			name:             "No span elements",
+			snippet:          "Just some text",
+			expectedTokens:   0,
+			expectedContains: []string{},
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			parsed, parseErr := sm.ParseSnippet(tc.snippet)
+			require.NoError(t, parseErr)
+			assert.Len(t, parsed, tc.expectedTokens)
+
+			// Compare texts pairwise; stop once either list is exhausted.
+			for i := 0; i < len(tc.expectedContains) && i < len(parsed); i++ {
+				assert.Equal(t, tc.expectedContains[i], parsed[i].Text)
+			}
+		})
+	}
+}
+
+// TestSnippetMatcher_CheckToken runs CheckToken on hand-built TokenSpans and
+// expects a match only when "marmot/m:gender:masc" is among the annotations.
+func TestSnippetMatcher_CheckToken(t *testing.T) {
+	pattern := ast.Pattern{
+		Root: &ast.Term{
+			Foundry: "marmot",
+			Layer:   "m",
+			Key:     "gender",
+			Value:   "masc",
+			Match:   ast.MatchEqual,
+		},
+	}
+	replacement := ast.Replacement{
+		Root: &ast.Term{
+			Foundry: "opennlp",
+			Layer:   "m",
+			Key:     "M",
+			Value:   "",
+			Match:   ast.MatchEqual,
+		},
+	}
+
+	sm, err := NewSnippetMatcher(pattern, replacement)
+	require.NoError(t, err)
+
+	cases := []struct {
+		name        string
+		token       TokenSpan
+		shouldMatch bool
+	}{
+		{
+			name: "Token with matching annotation",
+			token: TokenSpan{
+				Text: "Der",
+				Annotations: []string{
+					"corenlp/p:ART",
+					"marmot/m:case:nom",
+					"marmot/m:gender:masc",
+					"marmot/m:number:sg",
+				},
+			},
+			shouldMatch: true,
+		},
+		{
+			name: "Token without matching annotation",
+			token: TokenSpan{
+				Text: "und",
+				Annotations: []string{
+					"corenlp/p:KON",
+					"marmot/p:KON",
+					"opennlp/p:KON",
+				},
+			},
+			shouldMatch: false,
+		},
+		{
+			name: "Token with no annotations",
+			token: TokenSpan{
+				Text:        "text",
+				Annotations: []string{},
+			},
+			shouldMatch: false,
+		},
+		{
+			name: "Token with different gender value",
+			token: TokenSpan{
+				Text: "andere",
+				Annotations: []string{
+					"marmot/m:gender:fem",
+					"marmot/m:case:nom",
+				},
+			},
+			shouldMatch: false,
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			got, checkErr := sm.CheckToken(tc.token)
+			require.NoError(t, checkErr)
+			assert.Equal(t, tc.shouldMatch, got)
+		})
+	}
+}
+
+// TestSnippetMatcher_FindMatchingTokens runs FindMatchingTokens over a
+// four-token snippet and expects exactly the three tokens annotated with
+// marmot/m:gender:masc, in document order.
+func TestSnippetMatcher_FindMatchingTokens(t *testing.T) {
+	// Create a pattern that matches tokens with marmot/m:gender=masc
+	pattern := ast.Pattern{
+		Root: &ast.Term{
+			Foundry: "marmot",
+			Layer:   "m",
+			Key:     "gender",
+			Value:   "masc",
+			Match:   ast.MatchEqual,
+		},
+	}
+
+	replacement := ast.Replacement{
+		Root: &ast.Term{
+			Foundry: "opennlp",
+			Layer:   "m",
+			Key:     "M",
+			Value:   "",
+			Match:   ast.MatchEqual,
+		},
+	}
+
+	sm, err := NewSnippetMatcher(pattern, replacement)
+	require.NoError(t, err)
+
+	// Test snippet with mixed tokens - some matching, some not
+	snippet := `<span title="corenlp/p:ART">
+		<span title="marmot/m:case:nom">
+		<span title="marmot/m:gender:masc">
+		<span title="marmot/m:number:sg">
+		Der</span>
+		</span>
+		</span>
+		</span> 
+		<span title="corenlp/p:ADJA">
+		<span title="marmot/m:case:nom">
+		<span title="marmot/m:gender:masc">
+		alte</span>
+		</span>
+		</span> 
+		<span title="corenlp/p:NN">
+		<span title="marmot/m:case:nom">
+		<span title="marmot/m:gender:masc">
+		Baum</span>
+		</span>
+		</span> 
+		<span title="corenlp/p:KON">
+		<span title="marmot/p:KON">
+		und</span>
+		</span>`
+
+	matchingTokens, err := sm.FindMatchingTokens(snippet)
+	require.NoError(t, err)
+
+	// Should find 3 matching tokens: "Der", "alte", "Baum" (all with gender:masc)
+	// but not "und" (no gender annotation)
+	// require (not assert): the loop below indexes expectedTexts by position,
+	// so a wrong count must stop the test instead of panicking out of range.
+	require.Len(t, matchingTokens, 3)
+
+	expectedTexts := []string{"Der", "alte", "Baum"}
+	for i, token := range matchingTokens {
+		assert.Equal(t, expectedTexts[i], token.Text)
+
+		// Verify that each token has the required annotation. On failure the
+		// message names the offending token.
+		assert.Contains(t, token.Annotations, "marmot/m:gender:masc",
+			"Token %s should have marmot/m:gender:masc annotation", token.Text)
+	}
+}
+
+// TestSnippetMatcher_CheckTokenSequence expects CheckTokenSequence to return
+// true when one token matches the pattern and false when none does (or empty).
+func TestSnippetMatcher_CheckTokenSequence(t *testing.T) {
+	pattern := ast.Pattern{
+		Root: &ast.Term{
+			Foundry: "marmot",
+			Layer:   "m",
+			Key:     "gender",
+			Value:   "masc",
+			Match:   ast.MatchEqual,
+		},
+	}
+	replacement := ast.Replacement{
+		Root: &ast.Term{
+			Foundry: "opennlp",
+			Layer:   "m",
+			Key:     "M",
+			Value:   "",
+			Match:   ast.MatchEqual,
+		},
+	}
+
+	sm, err := NewSnippetMatcher(pattern, replacement)
+	require.NoError(t, err)
+
+	cases := []struct {
+		name        string
+		tokens      []TokenSpan
+		shouldMatch bool
+	}{
+		{
+			name: "Sequence with matching token",
+			tokens: []TokenSpan{
+				{
+					Text: "Der",
+					Annotations: []string{
+						"marmot/m:gender:masc",
+						"marmot/m:case:nom",
+					},
+				},
+				{
+					Text: "alte",
+					Annotations: []string{
+						"marmot/m:gender:fem",
+						"marmot/m:case:nom",
+					},
+				},
+			},
+			shouldMatch: true, // First token matches
+		},
+		{
+			name: "Sequence with no matching tokens",
+			tokens: []TokenSpan{
+				{
+					Text: "und",
+					Annotations: []string{
+						"marmot/p:KON",
+					},
+				},
+				{
+					Text: "oder",
+					Annotations: []string{
+						"marmot/p:KON",
+					},
+				},
+			},
+			shouldMatch: false,
+		},
+		{
+			name:        "Empty sequence",
+			tokens:      []TokenSpan{},
+			shouldMatch: false,
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			got, seqErr := sm.CheckTokenSequence(tc.tokens)
+			require.NoError(t, seqErr)
+			assert.Equal(t, tc.shouldMatch, got)
+		})
+	}
+}
+
+// TestSnippetMatcher_GetReplacement checks that GetReplacement returns the
+// replacement root as an *ast.Term carrying the foundry, layer and key given.
+func TestSnippetMatcher_GetReplacement(t *testing.T) {
+	pattern := ast.Pattern{
+		Root: &ast.Term{
+			Foundry: "marmot",
+			Layer:   "m",
+			Key:     "gender",
+			Value:   "masc",
+			Match:   ast.MatchEqual,
+		},
+	}
+	replacement := ast.Replacement{
+		Root: &ast.Term{
+			Foundry: "opennlp",
+			Layer:   "m",
+			Key:     "M",
+			Value:   "",
+			Match:   ast.MatchEqual,
+		},
+	}
+
+	sm, err := NewSnippetMatcher(pattern, replacement)
+	require.NoError(t, err)
+
+	node := sm.GetReplacement()
+	require.NotNil(t, node)
+	got, isTerm := node.(*ast.Term)
+	require.True(t, isTerm)
+	assert.Equal(t, "opennlp", got.Foundry)
+	assert.Equal(t, "m", got.Layer)
+	assert.Equal(t, "M", got.Key)
+}
+
+// TestSnippetMatcher_RealWorldExample parses one deeply nested "Der" token,
+// checks all eight span titles become annotations, and matches it via CheckToken.
+func TestSnippetMatcher_RealWorldExample(t *testing.T) {
+	pattern := ast.Pattern{
+		Root: &ast.Term{
+			Foundry: "marmot",
+			Layer:   "m",
+			Key:     "gender",
+			Value:   "masc",
+			Match:   ast.MatchEqual,
+		},
+	}
+
+	replacement := ast.Replacement{
+		Root: &ast.Term{
+			Foundry: "opennlp",
+			Layer:   "m",
+			Key:     "M",
+			Value:   "",
+			Match:   ast.MatchEqual,
+		},
+	}
+
+	sm, err := NewSnippetMatcher(pattern, replacement)
+	require.NoError(t, err)
+
+	// Single-line form of the nested snippet (no whitespace between tags).
+	snippet := `<span title="corenlp/p:ART">` +
+		`<span title="marmot/m:case:nom">` +
+		`<span title="marmot/m:gender:masc">` +
+		`<span title="marmot/m:number:sg">` +
+		`<span title="marmot/p:ART">` +
+		`<span title="opennlp/p:ART">` +
+		`<span title="tt/l:die">` +
+		`<span title="tt/p:ART">Der</span>` +
+		`</span>` +
+		`</span>` +
+		`</span>` +
+		`</span>` +
+		`</span>` +
+		`</span>` +
+		`</span>`
+
+	parsed, err := sm.ParseSnippet(snippet)
+	require.NoError(t, err)
+	require.Len(t, parsed, 1)
+
+	first := parsed[0]
+	assert.Equal(t, "Der", first.Text)
+
+	// One annotation is expected per enclosing span title.
+	wantAnnotations := []string{
+		"corenlp/p:ART",
+		"marmot/m:case:nom",
+		"marmot/m:gender:masc",
+		"marmot/m:number:sg",
+		"marmot/p:ART",
+		"opennlp/p:ART",
+		"tt/l:die",
+		"tt/p:ART",
+	}
+
+	assert.Len(t, first.Annotations, len(wantAnnotations))
+	for i := range wantAnnotations {
+		assert.Contains(t, first.Annotations, wantAnnotations[i])
+	}
+
+	// The parsed token must satisfy the gender=masc pattern.
+	matched, err := sm.CheckToken(first)
+	require.NoError(t, err)
+	assert.True(t, matched)
+}