Implement snippet_matcher to parse snippets and match them against mapping lists
Change-Id: I79c7736dbfcc9f6ca61486e9211647974f011bd3
diff --git a/go.mod b/go.mod
index c166c3b..1a51b70 100644
--- a/go.mod
+++ b/go.mod
@@ -21,6 +21,7 @@
github.com/mattn/go-colorable v0.1.14 // indirect
github.com/mattn/go-isatty v0.0.20 // indirect
github.com/mattn/go-runewidth v0.0.16 // indirect
+ github.com/orisano/gosax v1.1.2 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/rivo/uniseg v0.4.7 // indirect
github.com/valyala/bytebufferpool v1.0.0 // indirect
diff --git a/go.sum b/go.sum
index a793d95..91fdbb9 100644
--- a/go.sum
+++ b/go.sum
@@ -29,6 +29,8 @@
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/mattn/go-runewidth v0.0.16 h1:E5ScNMtiwvlvB5paMFdw9p4kSQzbXFikJ5SQO6TULQc=
github.com/mattn/go-runewidth v0.0.16/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
+github.com/orisano/gosax v1.1.2 h1:5tx4vxxTjxlW8zEQ3AtmagrcQjGrikVg+4J2uvlArqQ=
+github.com/orisano/gosax v1.1.2/go.mod h1:mw6A5jIOFDeVOqffQkggKOOjRFevYnLyXgiZP06fRjI=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
diff --git a/matcher/snippet_matcher.go b/matcher/snippet_matcher.go
new file mode 100644
index 0000000..35e9461
--- /dev/null
+++ b/matcher/snippet_matcher.go
@@ -0,0 +1,282 @@
+package matcher
+
+import (
+ "fmt"
+ "sort"
+ "strings"
+
+ "github.com/KorAP/KoralPipe-TermMapper/ast"
+ "github.com/KorAP/KoralPipe-TermMapper/parser"
+ "github.com/orisano/gosax"
+)
+
// TokenSpan represents a token and its position in the snippet.
// Positions are byte offsets into the concatenation of the snippet's text
// nodes (markup is not counted), so tokens sort in document order.
type TokenSpan struct {
	Text        string   // The actual token text (whitespace-trimmed)
	StartPos    int      // Character position where the token starts
	EndPos      int      // Character position where the token ends
	Annotations []string // All title attributes that annotate this token
}
+
// SnippetMatcher extends the basic matcher to work with HTML/XML snippets.
// It parses annotated snippets into TokenSpans and checks their title
// attributes against the configured pattern.
type SnippetMatcher struct {
	matcher     *Matcher                     // pattern/replacement matcher for AST nodes
	titleParser *parser.TitleAttributeParser // converts title attributes into AST terms
}
+
+// NewSnippetMatcher creates a new snippet matcher
+func NewSnippetMatcher(pattern ast.Pattern, replacement ast.Replacement) (*SnippetMatcher, error) {
+ matcher, err := NewMatcher(pattern, replacement)
+ if err != nil {
+ return nil, fmt.Errorf("failed to create base matcher: %w", err)
+ }
+
+ return &SnippetMatcher{
+ matcher: matcher,
+ titleParser: parser.NewTitleAttributeParser(),
+ }, nil
+}
+
+// ParseSnippet parses an HTML/XML snippet and extracts tokens with their annotations
+func (sm *SnippetMatcher) ParseSnippet(snippet string) ([]TokenSpan, error) {
+ tokens := make([]TokenSpan, 0)
+
+ // Stack to track nested spans and their annotations
+ type spanInfo struct {
+ title string
+ level int
+ }
+ spanStack := make([]spanInfo, 0)
+
+ // Current position tracking
+ var currentPos int
+
+ reader := strings.NewReader(snippet)
+ r := gosax.NewReader(reader)
+
+ for {
+ e, err := r.Event()
+ if err != nil {
+ return nil, fmt.Errorf("failed to parse snippet: %w", err)
+ }
+
+ if e.Type() == 8 { // gosax.EventEOF
+ break
+ }
+
+ switch e.Type() {
+ case 1: // gosax.EventStart
+ // Parse start element
+ startElem, err := gosax.StartElement(e.Bytes)
+ if err != nil {
+ continue // Skip invalid elements
+ }
+
+ if startElem.Name.Local == "span" {
+ // Look for title attribute
+ var title string
+ for _, attr := range startElem.Attr {
+ if attr.Name.Local == "title" {
+ title = attr.Value
+ break
+ }
+ }
+ spanStack = append(spanStack, spanInfo{title: title, level: len(spanStack)})
+ }
+
+ case 2: // gosax.EventEnd
+ // Parse end element
+ endElem := gosax.EndElement(e.Bytes)
+ if endElem.Name.Local == "span" && len(spanStack) > 0 {
+ spanStack = spanStack[:len(spanStack)-1]
+ }
+
+ case 3: // gosax.EventText
+ // Process character data
+ charData, err := gosax.CharData(e.Bytes)
+ if err != nil {
+ continue
+ }
+
+ text := string(charData)
+ trimmed := strings.TrimSpace(text)
+ if trimmed != "" && len(spanStack) > 0 {
+ // Only create tokens if we're inside at least one span
+ // Collect all annotations from the current span stack
+ annotations := make([]string, 0)
+ for _, span := range spanStack {
+ if span.title != "" {
+ annotations = append(annotations, span.title)
+ }
+ }
+
+ // Create token span
+ token := TokenSpan{
+ Text: trimmed,
+ StartPos: currentPos,
+ EndPos: currentPos + len(trimmed),
+ Annotations: annotations,
+ }
+ tokens = append(tokens, token)
+ }
+ currentPos += len(text)
+ }
+ }
+
+ // Sort tokens by start position to ensure proper order
+ sort.Slice(tokens, func(i, j int) bool {
+ return tokens[i].StartPos < tokens[j].StartPos
+ })
+
+ return tokens, nil
+}
+
+// CheckToken checks if a token's annotations match the pattern
+func (sm *SnippetMatcher) CheckToken(token TokenSpan) (bool, error) {
+ if len(token.Annotations) == 0 {
+ return false, nil
+ }
+
+ // Parse all annotations into AST terms
+ terms, err := sm.titleParser.ParseTitleAttributesToTerms(token.Annotations)
+ if err != nil {
+ return false, fmt.Errorf("failed to parse token annotations: %w", err)
+ }
+
+ if len(terms) == 0 {
+ return false, nil
+ }
+
+ // Create a TermGroup with AND relation for all annotations
+ var nodeToMatch ast.Node
+ if len(terms) == 1 {
+ nodeToMatch = terms[0]
+ } else {
+ nodeToMatch = &ast.TermGroup{
+ Operands: terms,
+ Relation: ast.AndRelation,
+ }
+ }
+
+ // Check if the constructed node matches our pattern
+ return sm.matcher.Match(nodeToMatch), nil
+}
+
+// CheckTokenSequence checks if a sequence of tokens matches the pattern
+func (sm *SnippetMatcher) CheckTokenSequence(tokens []TokenSpan) (bool, error) {
+ if len(tokens) == 0 {
+ return false, nil
+ }
+
+ // For token sequences, we need to check different strategies:
+ // 1. Check if any individual token matches
+ // 2. Check if the combined annotations of all tokens match
+
+ // Strategy 1: Check individual tokens
+ for _, token := range tokens {
+ matches, err := sm.CheckToken(token)
+ if err != nil {
+ return false, err
+ }
+ if matches {
+ return true, nil
+ }
+ }
+
+ // Strategy 2: Check combined annotations
+ allAnnotations := make([]string, 0)
+ for _, token := range tokens {
+ allAnnotations = append(allAnnotations, token.Annotations...)
+ }
+
+ // Remove duplicates from combined annotations
+ annotationMap := make(map[string]bool)
+ uniqueAnnotations := make([]string, 0)
+ for _, annotation := range allAnnotations {
+ if !annotationMap[annotation] {
+ annotationMap[annotation] = true
+ uniqueAnnotations = append(uniqueAnnotations, annotation)
+ }
+ }
+
+ if len(uniqueAnnotations) == 0 {
+ return false, nil
+ }
+
+ // Create a combined token for checking
+ combinedToken := TokenSpan{
+ Text: strings.Join(getTokenTexts(tokens), " "),
+ StartPos: tokens[0].StartPos,
+ EndPos: tokens[len(tokens)-1].EndPos,
+ Annotations: uniqueAnnotations,
+ }
+
+ return sm.CheckToken(combinedToken)
+}
+
+// FindMatchingTokens finds all tokens in the snippet that match the pattern
+func (sm *SnippetMatcher) FindMatchingTokens(snippet string) ([]TokenSpan, error) {
+ tokens, err := sm.ParseSnippet(snippet)
+ if err != nil {
+ return nil, err
+ }
+
+ matchingTokens := make([]TokenSpan, 0)
+
+ for _, token := range tokens {
+ matches, err := sm.CheckToken(token)
+ if err != nil {
+ return nil, fmt.Errorf("failed to check token '%s': %w", token.Text, err)
+ }
+ if matches {
+ matchingTokens = append(matchingTokens, token)
+ }
+ }
+
+ return matchingTokens, nil
+}
+
+// FindMatchingTokenSequences finds all token sequences that match the pattern
+func (sm *SnippetMatcher) FindMatchingTokenSequences(snippet string, maxSequenceLength int) ([][]TokenSpan, error) {
+ tokens, err := sm.ParseSnippet(snippet)
+ if err != nil {
+ return nil, err
+ }
+
+ if maxSequenceLength <= 0 {
+ maxSequenceLength = len(tokens)
+ }
+
+ matchingSequences := make([][]TokenSpan, 0)
+
+ // Check all possible token sequences up to maxSequenceLength
+ for start := 0; start < len(tokens); start++ {
+ for length := 1; length <= maxSequenceLength && start+length <= len(tokens); length++ {
+ sequence := tokens[start : start+length]
+
+ matches, err := sm.CheckTokenSequence(sequence)
+ if err != nil {
+ return nil, fmt.Errorf("failed to check token sequence: %w", err)
+ }
+ if matches {
+ matchingSequences = append(matchingSequences, sequence)
+ }
+ }
+ }
+
+ return matchingSequences, nil
+}
+
// GetReplacement returns the replacement node from the matcher — the root of
// the replacement AST that was supplied at construction time.
func (sm *SnippetMatcher) GetReplacement() ast.Node {
	return sm.matcher.replacement.Root
}
+
+// Helper function to extract token texts
+func getTokenTexts(tokens []TokenSpan) []string {
+ texts := make([]string, len(tokens))
+ for i, token := range tokens {
+ texts[i] = token.Text
+ }
+ return texts
+}
diff --git a/matcher/snippet_matcher_test.go b/matcher/snippet_matcher_test.go
new file mode 100644
index 0000000..f740d65
--- /dev/null
+++ b/matcher/snippet_matcher_test.go
@@ -0,0 +1,472 @@
+package matcher
+
+import (
+ "testing"
+
+ "github.com/KorAP/KoralPipe-TermMapper/ast"
+ "github.com/stretchr/testify/assert"
+ "github.com/stretchr/testify/require"
+)
+
// TestSnippetMatcher_ParseSnippet verifies that ParseSnippet extracts the
// expected number of tokens (and their texts) from a variety of snippets:
// deeply nested spans, multiple sibling tokens, and degenerate inputs.
func TestSnippetMatcher_ParseSnippet(t *testing.T) {
	// Create a pattern for testing (the pattern itself is irrelevant to
	// parsing; it is only needed to construct the matcher).
	pattern := ast.Pattern{
		Root: &ast.Term{
			Foundry: "marmot",
			Layer:   "m",
			Key:     "gender",
			Value:   "masc",
			Match:   ast.MatchEqual,
		},
	}

	replacement := ast.Replacement{
		Root: &ast.Term{
			Foundry: "opennlp",
			Layer:   "m",
			Key:     "M",
			Value:   "",
			Match:   ast.MatchEqual,
		},
	}

	sm, err := NewSnippetMatcher(pattern, replacement)
	require.NoError(t, err)

	tests := []struct {
		name             string
		snippet          string
		expectedTokens   int
		expectedContains []string
	}{
		{
			name: "Simple single token",
			snippet: `<span title="corenlp/p:ART">
	<span title="marmot/m:case:nom">
	<span title="marmot/m:gender:masc">
	<span title="marmot/m:number:sg">
	<span title="marmot/p:ART">
	Der</span>
	</span>
	</span>
	</span>
	</span>`,
			expectedTokens:   1,
			expectedContains: []string{"Der"},
		},
		{
			name: "Multiple tokens",
			snippet: `<span title="corenlp/p:ART">
	<span title="marmot/m:case:nom">
	<span title="marmot/m:gender:masc">
	Der</span>
	</span>
	</span>
	<span title="corenlp/p:ADJA">
	<span title="marmot/m:case:nom">
	<span title="marmot/m:gender:masc">
	alte</span>
	</span>
	</span>`,
			expectedTokens:   2,
			expectedContains: []string{"Der", "alte"},
		},
		{
			name: "Real-world example from test",
			snippet: `<span title="corenlp/p:ART">
	<span title="marmot/m:case:nom">
	<span title="marmot/m:gender:masc">
	<span title="marmot/m:number:sg">
	<span title="marmot/p:ART">
	<span title="opennlp/p:ART">
	<span title="tt/l:die">
	<span title="tt/p:ART">Der</span>
	</span>
	</span>
	</span>
	</span>
	</span>
	</span>
	</span>`,
			expectedTokens:   1,
			expectedContains: []string{"Der"},
		},
		{
			name:             "Empty snippet",
			snippet:          "",
			expectedTokens:   0,
			expectedContains: []string{},
		},
		{
			// Text outside any <span> must not produce tokens.
			name:             "No span elements",
			snippet:          "Just some text",
			expectedTokens:   0,
			expectedContains: []string{},
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			tokens, err := sm.ParseSnippet(tt.snippet)
			require.NoError(t, err)

			assert.Len(t, tokens, tt.expectedTokens)

			// The index guard avoids a panic when fewer tokens than
			// expected were produced; assert.Len above already reports
			// the count mismatch in that case.
			for i, expectedText := range tt.expectedContains {
				if i < len(tokens) {
					assert.Equal(t, expectedText, tokens[i].Text)
				}
			}
		})
	}
}
+
// TestSnippetMatcher_CheckToken verifies CheckToken against tokens whose
// annotation sets do or do not satisfy the marmot/m:gender=masc pattern,
// including the no-annotation and wrong-value cases.
func TestSnippetMatcher_CheckToken(t *testing.T) {
	// Create a pattern that matches tokens with marmot/m:gender=masc
	pattern := ast.Pattern{
		Root: &ast.Term{
			Foundry: "marmot",
			Layer:   "m",
			Key:     "gender",
			Value:   "masc",
			Match:   ast.MatchEqual,
		},
	}

	replacement := ast.Replacement{
		Root: &ast.Term{
			Foundry: "opennlp",
			Layer:   "m",
			Key:     "M",
			Value:   "",
			Match:   ast.MatchEqual,
		},
	}

	sm, err := NewSnippetMatcher(pattern, replacement)
	require.NoError(t, err)

	tests := []struct {
		name        string
		token       TokenSpan
		shouldMatch bool
	}{
		{
			name: "Token with matching annotation",
			token: TokenSpan{
				Text: "Der",
				Annotations: []string{
					"corenlp/p:ART",
					"marmot/m:case:nom",
					"marmot/m:gender:masc",
					"marmot/m:number:sg",
				},
			},
			shouldMatch: true,
		},
		{
			name: "Token without matching annotation",
			token: TokenSpan{
				Text: "und",
				Annotations: []string{
					"corenlp/p:KON",
					"marmot/p:KON",
					"opennlp/p:KON",
				},
			},
			shouldMatch: false,
		},
		{
			// No annotations at all must short-circuit to false.
			name: "Token with no annotations",
			token: TokenSpan{
				Text:        "text",
				Annotations: []string{},
			},
			shouldMatch: false,
		},
		{
			// Same key but a different value must not match.
			name: "Token with different gender value",
			token: TokenSpan{
				Text: "andere",
				Annotations: []string{
					"marmot/m:gender:fem",
					"marmot/m:case:nom",
				},
			},
			shouldMatch: false,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			matches, err := sm.CheckToken(tt.token)
			require.NoError(t, err)
			assert.Equal(t, tt.shouldMatch, matches)
		})
	}
}
+
// TestSnippetMatcher_FindMatchingTokens checks that FindMatchingTokens
// returns exactly the tokens carrying the marmot/m:gender:masc annotation
// from a snippet containing a mix of matching and non-matching tokens.
func TestSnippetMatcher_FindMatchingTokens(t *testing.T) {
	// Create a pattern that matches tokens with marmot/m:gender=masc
	pattern := ast.Pattern{
		Root: &ast.Term{
			Foundry: "marmot",
			Layer:   "m",
			Key:     "gender",
			Value:   "masc",
			Match:   ast.MatchEqual,
		},
	}

	replacement := ast.Replacement{
		Root: &ast.Term{
			Foundry: "opennlp",
			Layer:   "m",
			Key:     "M",
			Value:   "",
			Match:   ast.MatchEqual,
		},
	}

	sm, err := NewSnippetMatcher(pattern, replacement)
	require.NoError(t, err)

	// Test snippet with mixed tokens - some matching, some not
	snippet := `<span title="corenlp/p:ART">
	<span title="marmot/m:case:nom">
	<span title="marmot/m:gender:masc">
	<span title="marmot/m:number:sg">
	Der</span>
	</span>
	</span>
	</span>
	<span title="corenlp/p:ADJA">
	<span title="marmot/m:case:nom">
	<span title="marmot/m:gender:masc">
	alte</span>
	</span>
	</span>
	<span title="corenlp/p:NN">
	<span title="marmot/m:case:nom">
	<span title="marmot/m:gender:masc">
	Baum</span>
	</span>
	</span>
	<span title="corenlp/p:KON">
	<span title="marmot/p:KON">
	und</span>
	</span>`

	matchingTokens, err := sm.FindMatchingTokens(snippet)
	require.NoError(t, err)

	// Should find 3 matching tokens: "Der", "alte", "Baum" (all with gender:masc)
	// but not "und" (no gender annotation)
	assert.Len(t, matchingTokens, 3)

	expectedTexts := []string{"Der", "alte", "Baum"}
	for i, token := range matchingTokens {
		assert.Equal(t, expectedTexts[i], token.Text)

		// Verify that each token has the required annotation
		hasGenderMasc := false
		for _, annotation := range token.Annotations {
			if annotation == "marmot/m:gender:masc" {
				hasGenderMasc = true
				break
			}
		}
		assert.True(t, hasGenderMasc, "Token %s should have marmot/m:gender:masc annotation", token.Text)
	}
}
+
// TestSnippetMatcher_CheckTokenSequence verifies that a sequence matches
// when any single token matches (strategy 1), that it fails when no token
// carries a matching annotation, and that the empty sequence never matches.
func TestSnippetMatcher_CheckTokenSequence(t *testing.T) {
	// Create a pattern for testing
	pattern := ast.Pattern{
		Root: &ast.Term{
			Foundry: "marmot",
			Layer:   "m",
			Key:     "gender",
			Value:   "masc",
			Match:   ast.MatchEqual,
		},
	}

	replacement := ast.Replacement{
		Root: &ast.Term{
			Foundry: "opennlp",
			Layer:   "m",
			Key:     "M",
			Value:   "",
			Match:   ast.MatchEqual,
		},
	}

	sm, err := NewSnippetMatcher(pattern, replacement)
	require.NoError(t, err)

	tests := []struct {
		name        string
		tokens      []TokenSpan
		shouldMatch bool
	}{
		{
			name: "Sequence with matching token",
			tokens: []TokenSpan{
				{
					Text: "Der",
					Annotations: []string{
						"marmot/m:gender:masc",
						"marmot/m:case:nom",
					},
				},
				{
					Text: "alte",
					Annotations: []string{
						"marmot/m:gender:fem",
						"marmot/m:case:nom",
					},
				},
			},
			shouldMatch: true, // First token matches
		},
		{
			name: "Sequence with no matching tokens",
			tokens: []TokenSpan{
				{
					Text: "und",
					Annotations: []string{
						"marmot/p:KON",
					},
				},
				{
					Text: "oder",
					Annotations: []string{
						"marmot/p:KON",
					},
				},
			},
			shouldMatch: false,
		},
		{
			// Degenerate case: no tokens, no match.
			name:        "Empty sequence",
			tokens:      []TokenSpan{},
			shouldMatch: false,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			matches, err := sm.CheckTokenSequence(tt.tokens)
			require.NoError(t, err)
			assert.Equal(t, tt.shouldMatch, matches)
		})
	}
}
+
+func TestSnippetMatcher_GetReplacement(t *testing.T) {
+ pattern := ast.Pattern{
+ Root: &ast.Term{
+ Foundry: "marmot",
+ Layer: "m",
+ Key: "gender",
+ Value: "masc",
+ Match: ast.MatchEqual,
+ },
+ }
+
+ replacement := ast.Replacement{
+ Root: &ast.Term{
+ Foundry: "opennlp",
+ Layer: "m",
+ Key: "M",
+ Value: "",
+ Match: ast.MatchEqual,
+ },
+ }
+
+ sm, err := NewSnippetMatcher(pattern, replacement)
+ require.NoError(t, err)
+
+ replacementNode := sm.GetReplacement()
+ require.NotNil(t, replacementNode)
+
+ term, ok := replacementNode.(*ast.Term)
+ require.True(t, ok)
+ assert.Equal(t, "opennlp", term.Foundry)
+ assert.Equal(t, "m", term.Layer)
+ assert.Equal(t, "M", term.Key)
+}
+
// TestSnippetMatcher_RealWorldExample runs the full pipeline — parse, inspect
// annotations, match — on a realistic deeply nested snippet taken from the
// response test fixtures.
func TestSnippetMatcher_RealWorldExample(t *testing.T) {
	// Test with the real-world example from the response test
	pattern := ast.Pattern{
		Root: &ast.Term{
			Foundry: "marmot",
			Layer:   "m",
			Key:     "gender",
			Value:   "masc",
			Match:   ast.MatchEqual,
		},
	}

	replacement := ast.Replacement{
		Root: &ast.Term{
			Foundry: "opennlp",
			Layer:   "m",
			Key:     "M",
			Value:   "",
			Match:   ast.MatchEqual,
		},
	}

	sm, err := NewSnippetMatcher(pattern, replacement)
	require.NoError(t, err)

	// Real snippet from the test file
	snippet := `<span title="corenlp/p:ART">` +
		`<span title="marmot/m:case:nom">` +
		`<span title="marmot/m:gender:masc">` +
		`<span title="marmot/m:number:sg">` +
		`<span title="marmot/p:ART">` +
		`<span title="opennlp/p:ART">` +
		`<span title="tt/l:die">` +
		`<span title="tt/p:ART">Der</span>` +
		`</span>` +
		`</span>` +
		`</span>` +
		`</span>` +
		`</span>` +
		`</span>` +
		`</span>`

	// Parse the snippet
	tokens, err := sm.ParseSnippet(snippet)
	require.NoError(t, err)
	require.Len(t, tokens, 1)

	token := tokens[0]
	assert.Equal(t, "Der", token.Text)

	// Check that it has all expected annotations
	expectedAnnotations := []string{
		"corenlp/p:ART",
		"marmot/m:case:nom",
		"marmot/m:gender:masc",
		"marmot/m:number:sg",
		"marmot/p:ART",
		"opennlp/p:ART",
		"tt/l:die",
		"tt/p:ART",
	}

	assert.Len(t, token.Annotations, len(expectedAnnotations))
	for _, expected := range expectedAnnotations {
		assert.Contains(t, token.Annotations, expected)
	}

	// Check that it matches our pattern
	matches, err := sm.CheckToken(token)
	require.NoError(t, err)
	assert.True(t, matches)
}