// Source blob: 3142f9fc490d195556325841bd9592bd061e283e

package matcher
import (
"testing"
"github.com/KorAP/Koral-Mapper/ast"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestSnippetMatcher_ParseSnippet(t *testing.T) {
// Create a pattern for testing
pattern := ast.Pattern{
Root: &ast.Term{
Foundry: "marmot",
Layer: "m",
Key: "gender",
Value: "masc",
Match: ast.MatchEqual,
},
}
replacement := ast.Replacement{
Root: &ast.Term{
Foundry: "opennlp",
Layer: "m",
Key: "M",
Value: "",
Match: ast.MatchEqual,
},
}
sm, err := NewSnippetMatcher(pattern, replacement)
require.NoError(t, err)
tests := []struct {
name string
snippet string
expectedTokens int
expectedContains []string
}{
{
name: "Simple single token",
snippet: `<span title="corenlp/p:ART">
<span title="marmot/m:case:nom">
<span title="marmot/m:gender:masc">
<span title="marmot/m:number:sg">
<span title="marmot/p:ART">
Der</span>
</span>
</span>
</span>
</span>`,
expectedTokens: 1,
expectedContains: []string{"Der"},
},
{
name: "Multiple tokens",
snippet: `<span title="corenlp/p:ART">
<span title="marmot/m:case:nom">
<span title="marmot/m:gender:masc">
Der</span>
</span>
</span>
<span title="corenlp/p:ADJA">
<span title="marmot/m:case:nom">
<span title="marmot/m:gender:masc">
alte</span>
</span>
</span>`,
expectedTokens: 2,
expectedContains: []string{"Der", "alte"},
},
{
name: "Real-world example from test",
snippet: `<span title="corenlp/p:ART">
<span title="marmot/m:case:nom">
<span title="marmot/m:gender:masc">
<span title="marmot/m:number:sg">
<span title="marmot/p:ART">
<span title="opennlp/p:ART">
<span title="tt/l:die">
<span title="tt/p:ART">Der</span>
</span>
</span>
</span>
</span>
</span>
</span>
</span>`,
expectedTokens: 1,
expectedContains: []string{"Der"},
},
{
name: "Empty snippet",
snippet: "",
expectedTokens: 0,
expectedContains: []string{},
},
{
name: "No span elements",
snippet: "Just some text",
expectedTokens: 0,
expectedContains: []string{},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
tokens, err := sm.ParseSnippet(tt.snippet)
require.NoError(t, err)
assert.Len(t, tokens, tt.expectedTokens)
for i, expectedText := range tt.expectedContains {
if i < len(tokens) {
assert.Equal(t, expectedText, tokens[i].Text)
}
}
})
}
}
// TestSnippetMatcher_CheckToken builds a matcher for the pattern
// marmot/m:gender=masc and verifies, for several hand-built TokenSpan values,
// that CheckToken returns no error and reports the expected match result.
func TestSnippetMatcher_CheckToken(t *testing.T) {
	genderMasc := ast.Pattern{
		Root: &ast.Term{
			Foundry: "marmot",
			Layer:   "m",
			Key:     "gender",
			Value:   "masc",
			Match:   ast.MatchEqual,
		},
	}
	opennlpM := ast.Replacement{
		Root: &ast.Term{
			Foundry: "opennlp",
			Layer:   "m",
			Key:     "M",
			Value:   "",
			Match:   ast.MatchEqual,
		},
	}

	sm, err := NewSnippetMatcher(genderMasc, opennlpM)
	require.NoError(t, err)

	cases := []struct {
		name  string
		token TokenSpan
		want  bool // expected CheckToken result
	}{
		{
			name: "Token with matching annotation",
			token: TokenSpan{
				Text: "Der",
				Annotations: []string{
					"corenlp/p:ART",
					"marmot/m:case:nom",
					"marmot/m:gender:masc",
					"marmot/m:number:sg",
				},
			},
			want: true,
		},
		{
			name: "Token without matching annotation",
			token: TokenSpan{
				Text: "und",
				Annotations: []string{
					"corenlp/p:KON",
					"marmot/p:KON",
					"opennlp/p:KON",
				},
			},
			want: false,
		},
		{
			name: "Token with no annotations",
			token: TokenSpan{
				Text:        "text",
				Annotations: []string{},
			},
			want: false,
		},
		{
			// Same foundry/layer/key as the pattern, but a different value.
			name: "Token with different gender value",
			token: TokenSpan{
				Text: "andere",
				Annotations: []string{
					"marmot/m:gender:fem",
					"marmot/m:case:nom",
				},
			},
			want: false,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got, checkErr := sm.CheckToken(tc.token)
			require.NoError(t, checkErr)
			assert.Equal(t, tc.want, got)
		})
	}
}
// TestSnippetMatcher_FindMatchingTokens runs FindMatchingTokens on a snippet
// holding four tokens, three of which carry the marmot/m:gender:masc
// annotation, and checks that exactly those three are returned, in order, each
// still carrying that annotation.
func TestSnippetMatcher_FindMatchingTokens(t *testing.T) {
	// Create a pattern that matches tokens with marmot/m:gender=masc
	pattern := ast.Pattern{
		Root: &ast.Term{
			Foundry: "marmot",
			Layer:   "m",
			Key:     "gender",
			Value:   "masc",
			Match:   ast.MatchEqual,
		},
	}
	replacement := ast.Replacement{
		Root: &ast.Term{
			Foundry: "opennlp",
			Layer:   "m",
			Key:     "M",
			Value:   "",
			Match:   ast.MatchEqual,
		},
	}

	sm, err := NewSnippetMatcher(pattern, replacement)
	require.NoError(t, err)

	// Test snippet with mixed tokens - some matching, some not
	snippet := `<span title="corenlp/p:ART">
<span title="marmot/m:case:nom">
<span title="marmot/m:gender:masc">
<span title="marmot/m:number:sg">
Der</span>
</span>
</span>
</span>
<span title="corenlp/p:ADJA">
<span title="marmot/m:case:nom">
<span title="marmot/m:gender:masc">
alte</span>
</span>
</span>
<span title="corenlp/p:NN">
<span title="marmot/m:case:nom">
<span title="marmot/m:gender:masc">
Baum</span>
</span>
</span>
<span title="corenlp/p:KON">
<span title="marmot/p:KON">
und</span>
</span>`

	matchingTokens, err := sm.FindMatchingTokens(snippet)
	require.NoError(t, err)

	// Should find 3 matching tokens: "Der", "alte", "Baum" (all with gender:masc)
	// but not "und" (no gender annotation)
	expectedTexts := []string{"Der", "alte", "Baum"}

	// require rather than assert: the loop below indexes expectedTexts by the
	// position of each returned token, so continuing after a length mismatch
	// could panic with an index out of range instead of failing cleanly.
	require.Len(t, matchingTokens, len(expectedTexts))

	for i, token := range matchingTokens {
		assert.Equal(t, expectedTexts[i], token.Text)

		// Verify that each token has the required annotation
		assert.Contains(t, token.Annotations, "marmot/m:gender:masc",
			"Token %s should have marmot/m:gender:masc annotation", token.Text)
	}
}
func TestSnippetMatcher_RealWorldExample(t *testing.T) {
// Test with the real-world example from the response test
pattern := ast.Pattern{
Root: &ast.Term{
Foundry: "marmot",
Layer: "m",
Key: "gender",
Value: "masc",
Match: ast.MatchEqual,
},
}
replacement := ast.Replacement{
Root: &ast.Term{
Foundry: "opennlp",
Layer: "m",
Key: "M",
Value: "",
Match: ast.MatchEqual,
},
}
sm, err := NewSnippetMatcher(pattern, replacement)
require.NoError(t, err)
// Real snippet from the test file
snippet := `<span title="corenlp/p:ART">` +
`<span title="marmot/m:case:nom">` +
`<span title="marmot/m:gender:masc">` +
`<span title="marmot/m:number:sg">` +
`<span title="marmot/p:ART">` +
`<span title="opennlp/p:ART">` +
`<span title="tt/l:die">` +
`<span title="tt/p:ART">Der</span>` +
`</span>` +
`</span>` +
`</span>` +
`</span>` +
`</span>` +
`</span>` +
`</span>`
// Parse the snippet
tokens, err := sm.ParseSnippet(snippet)
require.NoError(t, err)
require.Len(t, tokens, 1)
token := tokens[0]
assert.Equal(t, "Der", token.Text)
// Check that it has all expected annotations
expectedAnnotations := []string{
"corenlp/p:ART",
"marmot/m:case:nom",
"marmot/m:gender:masc",
"marmot/m:number:sg",
"marmot/p:ART",
"opennlp/p:ART",
"tt/l:die",
"tt/p:ART",
}
assert.Len(t, token.Annotations, len(expectedAnnotations))
for _, expected := range expectedAnnotations {
assert.Contains(t, token.Annotations, expected)
}
// Check that it matches our pattern
matches, err := sm.CheckToken(token)
require.NoError(t, err)
assert.True(t, matches)
}