Implemented snippet_matcher to parse snippets and match against mapping lists
Change-Id: I79c7736dbfcc9f6ca61486e9211647974f011bd3
diff --git a/matcher/snippet_matcher_test.go b/matcher/snippet_matcher_test.go
new file mode 100644
index 0000000..f740d65
--- /dev/null
+++ b/matcher/snippet_matcher_test.go
@@ -0,0 +1,472 @@
+package matcher
+
+import (
+ "testing"
+
+ "github.com/KorAP/KoralPipe-TermMapper/ast"
+ "github.com/stretchr/testify/assert"
+ "github.com/stretchr/testify/require"
+)
+
+// TestSnippetMatcher_ParseSnippet feeds annotated <span> snippets to ParseSnippet and checks how many tokens come back and what text each one carries, in order.
+func TestSnippetMatcher_ParseSnippet(t *testing.T) {
+ pat := ast.Pattern{
+ Root: &ast.Term{
+ Foundry: "marmot",
+ Layer: "m",
+ Key: "gender",
+ Value: "masc",
+ Match: ast.MatchEqual,
+ },
+ }
+
+ repl := ast.Replacement{
+ Root: &ast.Term{
+ Foundry: "opennlp",
+ Layer: "m",
+ Key: "M",
+ Value: "",
+ Match: ast.MatchEqual,
+ },
+ }
+
+ sm, err := NewSnippetMatcher(pat, repl)
+ require.NoError(t, err)
+
+ cases := []struct {
+ name string
+ snippet string
+ wantCount int
+ wantTexts []string
+ }{
+ {
+ name: "Simple single token",
+ snippet: `<span title="corenlp/p:ART">
+ <span title="marmot/m:case:nom">
+ <span title="marmot/m:gender:masc">
+ <span title="marmot/m:number:sg">
+ <span title="marmot/p:ART">
+ Der</span>
+ </span>
+ </span>
+ </span>
+ </span>`,
+ wantCount: 1,
+ wantTexts: []string{"Der"},
+ },
+ {
+ name: "Multiple tokens",
+ snippet: `<span title="corenlp/p:ART">
+ <span title="marmot/m:case:nom">
+ <span title="marmot/m:gender:masc">
+ Der</span>
+ </span>
+ </span>
+ <span title="corenlp/p:ADJA">
+ <span title="marmot/m:case:nom">
+ <span title="marmot/m:gender:masc">
+ alte</span>
+ </span>
+ </span>`,
+ wantCount: 2,
+ wantTexts: []string{"Der", "alte"},
+ },
+ {
+ name: "Real-world example from test",
+ snippet: `<span title="corenlp/p:ART">
+ <span title="marmot/m:case:nom">
+ <span title="marmot/m:gender:masc">
+ <span title="marmot/m:number:sg">
+ <span title="marmot/p:ART">
+ <span title="opennlp/p:ART">
+ <span title="tt/l:die">
+ <span title="tt/p:ART">Der</span>
+ </span>
+ </span>
+ </span>
+ </span>
+ </span>
+ </span>
+ </span>`,
+ wantCount: 1,
+ wantTexts: []string{"Der"},
+ },
+ {
+ name: "Empty snippet",
+ snippet: "",
+ wantCount: 0,
+ wantTexts: []string{},
+ },
+ {
+ name: "No span elements",
+ snippet: "Just some text",
+ wantCount: 0,
+ wantTexts: []string{},
+ },
+ }
+
+ for _, tc := range cases {
+ t.Run(tc.name, func(t *testing.T) {
+ tokens, err := sm.ParseSnippet(tc.snippet)
+ require.NoError(t, err)
+ assert.Len(t, tokens, tc.wantCount)
+ // Compare texts position by position, stopping once the parsed tokens run out.
+ for i, want := range tc.wantTexts {
+ if i >= len(tokens) {
+ break
+ }
+ assert.Equal(t, want, tokens[i].Text)
+ }
+ })
+ }
+}
+
+// TestSnippetMatcher_CheckToken runs CheckToken on hand-built TokenSpan values against a matcher whose pattern is marmot/m:gender=masc.
+func TestSnippetMatcher_CheckToken(t *testing.T) {
+ pat := ast.Pattern{
+ Root: &ast.Term{
+ Foundry: "marmot",
+ Layer: "m",
+ Key: "gender",
+ Value: "masc",
+ Match: ast.MatchEqual,
+ },
+ }
+
+ repl := ast.Replacement{
+ Root: &ast.Term{
+ Foundry: "opennlp",
+ Layer: "m",
+ Key: "M",
+ Value: "",
+ Match: ast.MatchEqual,
+ },
+ }
+
+ sm, err := NewSnippetMatcher(pat, repl)
+ require.NoError(t, err)
+
+ cases := []struct {
+ name string
+ input TokenSpan
+ want bool
+ }{
+ {
+ name: "Token with matching annotation",
+ input: TokenSpan{
+ Text: "Der",
+ Annotations: []string{
+ "corenlp/p:ART",
+ "marmot/m:case:nom",
+ "marmot/m:gender:masc",
+ "marmot/m:number:sg",
+ },
+ },
+ want: true,
+ },
+ {
+ name: "Token without matching annotation",
+ input: TokenSpan{
+ Text: "und",
+ Annotations: []string{
+ "corenlp/p:KON",
+ "marmot/p:KON",
+ "opennlp/p:KON",
+ },
+ },
+ want: false,
+ },
+ {
+ name: "Token with no annotations",
+ input: TokenSpan{
+ Text: "text",
+ Annotations: []string{},
+ },
+ want: false,
+ },
+ {
+ name: "Token with different gender value",
+ input: TokenSpan{
+ Text: "andere",
+ Annotations: []string{
+ "marmot/m:gender:fem",
+ "marmot/m:case:nom",
+ },
+ },
+ want: false,
+ },
+ }
+
+ for _, tc := range cases {
+ t.Run(tc.name, func(t *testing.T) {
+ got, err := sm.CheckToken(tc.input)
+ require.NoError(t, err)
+ assert.Equal(t, tc.want, got)
+ })
+ }
+}
+
+// TestSnippetMatcher_FindMatchingTokens runs FindMatchingTokens on a snippet with
+// four tokens, three of which carry marmot/m:gender:masc, and checks that exactly
+// those three come back, in snippet order, each with the matching annotation
+// attached.
+func TestSnippetMatcher_FindMatchingTokens(t *testing.T) {
+ // Create a pattern that matches tokens with marmot/m:gender=masc
+ pattern := ast.Pattern{
+ Root: &ast.Term{
+ Foundry: "marmot",
+ Layer: "m",
+ Key: "gender",
+ Value: "masc",
+ Match: ast.MatchEqual,
+ },
+ }
+
+ replacement := ast.Replacement{
+ Root: &ast.Term{
+ Foundry: "opennlp",
+ Layer: "m",
+ Key: "M",
+ Value: "",
+ Match: ast.MatchEqual,
+ },
+ }
+
+ sm, err := NewSnippetMatcher(pattern, replacement)
+ require.NoError(t, err)
+
+ // Test snippet with mixed tokens - some matching, some not
+ snippet := `<span title="corenlp/p:ART">
+ <span title="marmot/m:case:nom">
+ <span title="marmot/m:gender:masc">
+ <span title="marmot/m:number:sg">
+ Der</span>
+ </span>
+ </span>
+ </span>
+ <span title="corenlp/p:ADJA">
+ <span title="marmot/m:case:nom">
+ <span title="marmot/m:gender:masc">
+ alte</span>
+ </span>
+ </span>
+ <span title="corenlp/p:NN">
+ <span title="marmot/m:case:nom">
+ <span title="marmot/m:gender:masc">
+ Baum</span>
+ </span>
+ </span>
+ <span title="corenlp/p:KON">
+ <span title="marmot/p:KON">
+ und</span>
+ </span>`
+
+ matchingTokens, err := sm.FindMatchingTokens(snippet)
+ require.NoError(t, err)
+
+ // Should find 3 matching tokens: "Der", "alte", "Baum" (all with gender:masc)
+ // but not "und" (no gender annotation).
+ // require (not assert) stops the test on a count mismatch, so the indexed
+ // lookup into expectedTexts below can never run out of range and panic.
+ require.Len(t, matchingTokens, 3)
+
+ expectedTexts := []string{"Der", "alte", "Baum"}
+ for i, token := range matchingTokens {
+ assert.Equal(t, expectedTexts[i], token.Text)
+
+ // Verify that each token has the required annotation
+ assert.Contains(t, token.Annotations, "marmot/m:gender:masc",
+ "Token %s should have marmot/m:gender:masc annotation", token.Text)
+ }
+}
+
+// TestSnippetMatcher_CheckTokenSequence runs CheckTokenSequence on token slices: one holding a marmot/m:gender:masc token, one holding none, and an empty one.
+func TestSnippetMatcher_CheckTokenSequence(t *testing.T) {
+ pat := ast.Pattern{
+ Root: &ast.Term{
+ Foundry: "marmot",
+ Layer: "m",
+ Key: "gender",
+ Value: "masc",
+ Match: ast.MatchEqual,
+ },
+ }
+
+ repl := ast.Replacement{
+ Root: &ast.Term{
+ Foundry: "opennlp",
+ Layer: "m",
+ Key: "M",
+ Value: "",
+ Match: ast.MatchEqual,
+ },
+ }
+
+ sm, err := NewSnippetMatcher(pat, repl)
+ require.NoError(t, err)
+
+ cases := []struct {
+ name string
+ sequence []TokenSpan
+ want bool
+ }{
+ {
+ name: "Sequence with matching token",
+ sequence: []TokenSpan{
+ {
+ Text: "Der",
+ Annotations: []string{
+ "marmot/m:gender:masc",
+ "marmot/m:case:nom",
+ },
+ },
+ {
+ Text: "alte",
+ Annotations: []string{
+ "marmot/m:gender:fem",
+ "marmot/m:case:nom",
+ },
+ },
+ },
+ want: true, // First token matches
+ },
+ {
+ name: "Sequence with no matching tokens",
+ sequence: []TokenSpan{
+ {
+ Text: "und",
+ Annotations: []string{
+ "marmot/p:KON",
+ },
+ },
+ {
+ Text: "oder",
+ Annotations: []string{
+ "marmot/p:KON",
+ },
+ },
+ },
+ want: false,
+ },
+ {
+ name: "Empty sequence",
+ sequence: []TokenSpan{},
+ want: false,
+ },
+ }
+
+ for _, tc := range cases {
+ t.Run(tc.name, func(t *testing.T) {
+ got, err := sm.CheckTokenSequence(tc.sequence)
+ require.NoError(t, err)
+ assert.Equal(t, tc.want, got)
+ })
+ }
+}
+
+// TestSnippetMatcher_GetReplacement checks that GetReplacement hands back an *ast.Term carrying the configured foundry, layer and key.
+func TestSnippetMatcher_GetReplacement(t *testing.T) {
+ pat := ast.Pattern{
+ Root: &ast.Term{
+ Foundry: "marmot",
+ Layer: "m",
+ Key: "gender",
+ Value: "masc",
+ Match: ast.MatchEqual,
+ },
+ }
+
+ repl := ast.Replacement{
+ Root: &ast.Term{
+ Foundry: "opennlp",
+ Layer: "m",
+ Key: "M",
+ Value: "",
+ Match: ast.MatchEqual,
+ },
+ }
+
+ sm, err := NewSnippetMatcher(pat, repl)
+ require.NoError(t, err)
+
+ node := sm.GetReplacement()
+ require.NotNil(t, node)
+ replTerm, isTerm := node.(*ast.Term)
+ require.True(t, isTerm)
+ assert.Equal(t, "opennlp", replTerm.Foundry)
+ assert.Equal(t, "m", replTerm.Layer)
+ assert.Equal(t, "M", replTerm.Key)
+}
+
+// TestSnippetMatcher_RealWorldExample parses one deeply nested single-token snippet, checks the token text and every collected annotation, then checks the token against the pattern.
+func TestSnippetMatcher_RealWorldExample(t *testing.T) {
+ pat := ast.Pattern{
+ Root: &ast.Term{
+ Foundry: "marmot",
+ Layer: "m",
+ Key: "gender",
+ Value: "masc",
+ Match: ast.MatchEqual,
+ },
+ }
+
+ repl := ast.Replacement{
+ Root: &ast.Term{
+ Foundry: "opennlp",
+ Layer: "m",
+ Key: "M",
+ Value: "",
+ Match: ast.MatchEqual,
+ },
+ }
+
+ sm, err := NewSnippetMatcher(pat, repl)
+ require.NoError(t, err)
+
+ // Real snippet taken from the response test
+ snippet := `<span title="corenlp/p:ART">` +
+ `<span title="marmot/m:case:nom">` +
+ `<span title="marmot/m:gender:masc">` +
+ `<span title="marmot/m:number:sg">` +
+ `<span title="marmot/p:ART">` +
+ `<span title="opennlp/p:ART">` +
+ `<span title="tt/l:die">` +
+ `<span title="tt/p:ART">Der</span>` +
+ `</span>` +
+ `</span>` +
+ `</span>` +
+ `</span>` +
+ `</span>` +
+ `</span>` +
+ `</span>`
+
+ // The nested spans must collapse into exactly one token
+ tokens, err := sm.ParseSnippet(snippet)
+ require.NoError(t, err)
+ require.Len(t, tokens, 1)
+
+ first := tokens[0]
+ assert.Equal(t, "Der", first.Text)
+
+ // One annotation is expected per span title in the snippet
+ wantAnnotations := []string{
+ "corenlp/p:ART",
+ "marmot/m:case:nom",
+ "marmot/m:gender:masc",
+ "marmot/m:number:sg",
+ "marmot/p:ART",
+ "opennlp/p:ART",
+ "tt/l:die",
+ "tt/p:ART",
+ }
+
+ assert.Len(t, first.Annotations, len(wantAnnotations))
+ for _, want := range wantAnnotations {
+ assert.Contains(t, first.Annotations, want)
+ }
+
+ // The parsed token carries marmot/m:gender:masc, so the pattern must match
+ matched, err := sm.CheckToken(first)
+ require.NoError(t, err)
+ assert.True(t, matched)
+}