Implement snippet matcher to parse snippets and match them against mapping lists

Change-Id: I79c7736dbfcc9f6ca61486e9211647974f011bd3
diff --git a/matcher/snippet_matcher.go b/matcher/snippet_matcher.go
new file mode 100644
index 0000000..35e9461
--- /dev/null
+++ b/matcher/snippet_matcher.go
@@ -0,0 +1,282 @@
+package matcher
+
+import (
+	"fmt"
+	"sort"
+	"strings"
+
+	"github.com/KorAP/KoralPipe-TermMapper/ast"
+	"github.com/KorAP/KoralPipe-TermMapper/parser"
+	"github.com/orisano/gosax"
+)
+
// TokenSpan represents a token and its position in the snippet.
// Positions are character offsets into the snippet's concatenated text
// content, as computed by ParseSnippet.
type TokenSpan struct {
	Text        string   // The actual token text (whitespace-trimmed)
	StartPos    int      // Character position where the token starts
	EndPos      int      // Character position where the token ends
	Annotations []string // All title attributes of the spans enclosing this token
}
+
// SnippetMatcher extends the basic matcher to work with HTML/XML snippets.
// It combines a pattern/replacement Matcher over AST nodes with a parser
// that converts span title attributes into AST terms.
type SnippetMatcher struct {
	matcher     *Matcher                     // matches parsed annotation terms against the pattern
	titleParser *parser.TitleAttributeParser // parses title attribute strings into AST terms
}
+
+// NewSnippetMatcher creates a new snippet matcher
+func NewSnippetMatcher(pattern ast.Pattern, replacement ast.Replacement) (*SnippetMatcher, error) {
+	matcher, err := NewMatcher(pattern, replacement)
+	if err != nil {
+		return nil, fmt.Errorf("failed to create base matcher: %w", err)
+	}
+
+	return &SnippetMatcher{
+		matcher:     matcher,
+		titleParser: parser.NewTitleAttributeParser(),
+	}, nil
+}
+
+// ParseSnippet parses an HTML/XML snippet and extracts tokens with their annotations
+func (sm *SnippetMatcher) ParseSnippet(snippet string) ([]TokenSpan, error) {
+	tokens := make([]TokenSpan, 0)
+
+	// Stack to track nested spans and their annotations
+	type spanInfo struct {
+		title string
+		level int
+	}
+	spanStack := make([]spanInfo, 0)
+
+	// Current position tracking
+	var currentPos int
+
+	reader := strings.NewReader(snippet)
+	r := gosax.NewReader(reader)
+
+	for {
+		e, err := r.Event()
+		if err != nil {
+			return nil, fmt.Errorf("failed to parse snippet: %w", err)
+		}
+
+		if e.Type() == 8 { // gosax.EventEOF
+			break
+		}
+
+		switch e.Type() {
+		case 1: // gosax.EventStart
+			// Parse start element
+			startElem, err := gosax.StartElement(e.Bytes)
+			if err != nil {
+				continue // Skip invalid elements
+			}
+
+			if startElem.Name.Local == "span" {
+				// Look for title attribute
+				var title string
+				for _, attr := range startElem.Attr {
+					if attr.Name.Local == "title" {
+						title = attr.Value
+						break
+					}
+				}
+				spanStack = append(spanStack, spanInfo{title: title, level: len(spanStack)})
+			}
+
+		case 2: // gosax.EventEnd
+			// Parse end element
+			endElem := gosax.EndElement(e.Bytes)
+			if endElem.Name.Local == "span" && len(spanStack) > 0 {
+				spanStack = spanStack[:len(spanStack)-1]
+			}
+
+		case 3: // gosax.EventText
+			// Process character data
+			charData, err := gosax.CharData(e.Bytes)
+			if err != nil {
+				continue
+			}
+
+			text := string(charData)
+			trimmed := strings.TrimSpace(text)
+			if trimmed != "" && len(spanStack) > 0 {
+				// Only create tokens if we're inside at least one span
+				// Collect all annotations from the current span stack
+				annotations := make([]string, 0)
+				for _, span := range spanStack {
+					if span.title != "" {
+						annotations = append(annotations, span.title)
+					}
+				}
+
+				// Create token span
+				token := TokenSpan{
+					Text:        trimmed,
+					StartPos:    currentPos,
+					EndPos:      currentPos + len(trimmed),
+					Annotations: annotations,
+				}
+				tokens = append(tokens, token)
+			}
+			currentPos += len(text)
+		}
+	}
+
+	// Sort tokens by start position to ensure proper order
+	sort.Slice(tokens, func(i, j int) bool {
+		return tokens[i].StartPos < tokens[j].StartPos
+	})
+
+	return tokens, nil
+}
+
+// CheckToken checks if a token's annotations match the pattern
+func (sm *SnippetMatcher) CheckToken(token TokenSpan) (bool, error) {
+	if len(token.Annotations) == 0 {
+		return false, nil
+	}
+
+	// Parse all annotations into AST terms
+	terms, err := sm.titleParser.ParseTitleAttributesToTerms(token.Annotations)
+	if err != nil {
+		return false, fmt.Errorf("failed to parse token annotations: %w", err)
+	}
+
+	if len(terms) == 0 {
+		return false, nil
+	}
+
+	// Create a TermGroup with AND relation for all annotations
+	var nodeToMatch ast.Node
+	if len(terms) == 1 {
+		nodeToMatch = terms[0]
+	} else {
+		nodeToMatch = &ast.TermGroup{
+			Operands: terms,
+			Relation: ast.AndRelation,
+		}
+	}
+
+	// Check if the constructed node matches our pattern
+	return sm.matcher.Match(nodeToMatch), nil
+}
+
+// CheckTokenSequence checks if a sequence of tokens matches the pattern
+func (sm *SnippetMatcher) CheckTokenSequence(tokens []TokenSpan) (bool, error) {
+	if len(tokens) == 0 {
+		return false, nil
+	}
+
+	// For token sequences, we need to check different strategies:
+	// 1. Check if any individual token matches
+	// 2. Check if the combined annotations of all tokens match
+
+	// Strategy 1: Check individual tokens
+	for _, token := range tokens {
+		matches, err := sm.CheckToken(token)
+		if err != nil {
+			return false, err
+		}
+		if matches {
+			return true, nil
+		}
+	}
+
+	// Strategy 2: Check combined annotations
+	allAnnotations := make([]string, 0)
+	for _, token := range tokens {
+		allAnnotations = append(allAnnotations, token.Annotations...)
+	}
+
+	// Remove duplicates from combined annotations
+	annotationMap := make(map[string]bool)
+	uniqueAnnotations := make([]string, 0)
+	for _, annotation := range allAnnotations {
+		if !annotationMap[annotation] {
+			annotationMap[annotation] = true
+			uniqueAnnotations = append(uniqueAnnotations, annotation)
+		}
+	}
+
+	if len(uniqueAnnotations) == 0 {
+		return false, nil
+	}
+
+	// Create a combined token for checking
+	combinedToken := TokenSpan{
+		Text:        strings.Join(getTokenTexts(tokens), " "),
+		StartPos:    tokens[0].StartPos,
+		EndPos:      tokens[len(tokens)-1].EndPos,
+		Annotations: uniqueAnnotations,
+	}
+
+	return sm.CheckToken(combinedToken)
+}
+
+// FindMatchingTokens finds all tokens in the snippet that match the pattern
+func (sm *SnippetMatcher) FindMatchingTokens(snippet string) ([]TokenSpan, error) {
+	tokens, err := sm.ParseSnippet(snippet)
+	if err != nil {
+		return nil, err
+	}
+
+	matchingTokens := make([]TokenSpan, 0)
+
+	for _, token := range tokens {
+		matches, err := sm.CheckToken(token)
+		if err != nil {
+			return nil, fmt.Errorf("failed to check token '%s': %w", token.Text, err)
+		}
+		if matches {
+			matchingTokens = append(matchingTokens, token)
+		}
+	}
+
+	return matchingTokens, nil
+}
+
+// FindMatchingTokenSequences finds all token sequences that match the pattern
+func (sm *SnippetMatcher) FindMatchingTokenSequences(snippet string, maxSequenceLength int) ([][]TokenSpan, error) {
+	tokens, err := sm.ParseSnippet(snippet)
+	if err != nil {
+		return nil, err
+	}
+
+	if maxSequenceLength <= 0 {
+		maxSequenceLength = len(tokens)
+	}
+
+	matchingSequences := make([][]TokenSpan, 0)
+
+	// Check all possible token sequences up to maxSequenceLength
+	for start := 0; start < len(tokens); start++ {
+		for length := 1; length <= maxSequenceLength && start+length <= len(tokens); length++ {
+			sequence := tokens[start : start+length]
+
+			matches, err := sm.CheckTokenSequence(sequence)
+			if err != nil {
+				return nil, fmt.Errorf("failed to check token sequence: %w", err)
+			}
+			if matches {
+				matchingSequences = append(matchingSequences, sequence)
+			}
+		}
+	}
+
+	return matchingSequences, nil
+}
+
// GetReplacement returns the root node of the replacement held by the
// underlying matcher (the node supplied to NewSnippetMatcher).
func (sm *SnippetMatcher) GetReplacement() ast.Node {
	return sm.matcher.replacement.Root
}
+
+// Helper function to extract token texts
+func getTokenTexts(tokens []TokenSpan) []string {
+	texts := make([]string, len(tokens))
+	for i, token := range tokens {
+		texts[i] = token.Text
+	}
+	return texts
+}
diff --git a/matcher/snippet_matcher_test.go b/matcher/snippet_matcher_test.go
new file mode 100644
index 0000000..f740d65
--- /dev/null
+++ b/matcher/snippet_matcher_test.go
@@ -0,0 +1,472 @@
+package matcher
+
+import (
+	"testing"
+
+	"github.com/KorAP/KoralPipe-TermMapper/ast"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
// TestSnippetMatcher_ParseSnippet verifies that ParseSnippet extracts the
// expected number of tokens (and their texts) from nested <span> markup,
// including an empty snippet and text outside any span.
func TestSnippetMatcher_ParseSnippet(t *testing.T) {
	// Create a pattern for testing
	pattern := ast.Pattern{
		Root: &ast.Term{
			Foundry: "marmot",
			Layer:   "m",
			Key:     "gender",
			Value:   "masc",
			Match:   ast.MatchEqual,
		},
	}

	replacement := ast.Replacement{
		Root: &ast.Term{
			Foundry: "opennlp",
			Layer:   "m",
			Key:     "M",
			Value:   "",
			Match:   ast.MatchEqual,
		},
	}

	sm, err := NewSnippetMatcher(pattern, replacement)
	require.NoError(t, err)

	tests := []struct {
		name             string
		snippet          string
		expectedTokens   int
		expectedContains []string
	}{
		{
			name: "Simple single token",
			snippet: `<span title="corenlp/p:ART">
				<span title="marmot/m:case:nom">
				<span title="marmot/m:gender:masc">
				<span title="marmot/m:number:sg">
				<span title="marmot/p:ART">
				Der</span>
				</span>
				</span>
				</span>
				</span>`,
			expectedTokens:   1,
			expectedContains: []string{"Der"},
		},
		{
			name: "Multiple tokens",
			snippet: `<span title="corenlp/p:ART">
				<span title="marmot/m:case:nom">
				<span title="marmot/m:gender:masc">
				Der</span>
				</span>
				</span> 
				<span title="corenlp/p:ADJA">
				<span title="marmot/m:case:nom">
				<span title="marmot/m:gender:masc">
				alte</span>
				</span>
				</span>`,
			expectedTokens:   2,
			expectedContains: []string{"Der", "alte"},
		},
		{
			name: "Real-world example from test",
			snippet: `<span title="corenlp/p:ART">
				<span title="marmot/m:case:nom">
				<span title="marmot/m:gender:masc">
				<span title="marmot/m:number:sg">
				<span title="marmot/p:ART">
				<span title="opennlp/p:ART">
				<span title="tt/l:die">
				<span title="tt/p:ART">Der</span>
				</span>
				</span>
				</span>
				</span>
				</span>
				</span>
				</span>`,
			expectedTokens:   1,
			expectedContains: []string{"Der"},
		},
		{
			name:             "Empty snippet",
			snippet:          "",
			expectedTokens:   0,
			expectedContains: []string{},
		},
		{
			name:             "No span elements",
			snippet:          "Just some text",
			expectedTokens:   0,
			expectedContains: []string{},
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			tokens, err := sm.ParseSnippet(tt.snippet)
			require.NoError(t, err)

			assert.Len(t, tokens, tt.expectedTokens)

			// Compare token texts positionally; guard against short results.
			for i, expectedText := range tt.expectedContains {
				if i < len(tokens) {
					assert.Equal(t, expectedText, tokens[i].Text)
				}
			}
		})
	}
}
+
// TestSnippetMatcher_CheckToken verifies that CheckToken matches exactly
// those tokens whose annotations include the pattern term
// (marmot/m:gender:masc) and rejects tokens without it.
func TestSnippetMatcher_CheckToken(t *testing.T) {
	// Create a pattern that matches tokens with marmot/m:gender=masc
	pattern := ast.Pattern{
		Root: &ast.Term{
			Foundry: "marmot",
			Layer:   "m",
			Key:     "gender",
			Value:   "masc",
			Match:   ast.MatchEqual,
		},
	}

	replacement := ast.Replacement{
		Root: &ast.Term{
			Foundry: "opennlp",
			Layer:   "m",
			Key:     "M",
			Value:   "",
			Match:   ast.MatchEqual,
		},
	}

	sm, err := NewSnippetMatcher(pattern, replacement)
	require.NoError(t, err)

	tests := []struct {
		name        string
		token       TokenSpan
		shouldMatch bool
	}{
		{
			name: "Token with matching annotation",
			token: TokenSpan{
				Text: "Der",
				Annotations: []string{
					"corenlp/p:ART",
					"marmot/m:case:nom",
					"marmot/m:gender:masc",
					"marmot/m:number:sg",
				},
			},
			shouldMatch: true,
		},
		{
			name: "Token without matching annotation",
			token: TokenSpan{
				Text: "und",
				Annotations: []string{
					"corenlp/p:KON",
					"marmot/p:KON",
					"opennlp/p:KON",
				},
			},
			shouldMatch: false,
		},
		{
			name: "Token with no annotations",
			token: TokenSpan{
				Text:        "text",
				Annotations: []string{},
			},
			shouldMatch: false,
		},
		{
			name: "Token with different gender value",
			token: TokenSpan{
				Text: "andere",
				Annotations: []string{
					"marmot/m:gender:fem",
					"marmot/m:case:nom",
				},
			},
			shouldMatch: false,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			matches, err := sm.CheckToken(tt.token)
			require.NoError(t, err)
			assert.Equal(t, tt.shouldMatch, matches)
		})
	}
}
+
// TestSnippetMatcher_FindMatchingTokens verifies that FindMatchingTokens
// returns only the tokens annotated with marmot/m:gender:masc from a
// snippet that mixes matching and non-matching tokens.
func TestSnippetMatcher_FindMatchingTokens(t *testing.T) {
	// Create a pattern that matches tokens with marmot/m:gender=masc
	pattern := ast.Pattern{
		Root: &ast.Term{
			Foundry: "marmot",
			Layer:   "m",
			Key:     "gender",
			Value:   "masc",
			Match:   ast.MatchEqual,
		},
	}

	replacement := ast.Replacement{
		Root: &ast.Term{
			Foundry: "opennlp",
			Layer:   "m",
			Key:     "M",
			Value:   "",
			Match:   ast.MatchEqual,
		},
	}

	sm, err := NewSnippetMatcher(pattern, replacement)
	require.NoError(t, err)

	// Test snippet with mixed tokens - some matching, some not
	snippet := `<span title="corenlp/p:ART">
		<span title="marmot/m:case:nom">
		<span title="marmot/m:gender:masc">
		<span title="marmot/m:number:sg">
		Der</span>
		</span>
		</span>
		</span> 
		<span title="corenlp/p:ADJA">
		<span title="marmot/m:case:nom">
		<span title="marmot/m:gender:masc">
		alte</span>
		</span>
		</span> 
		<span title="corenlp/p:NN">
		<span title="marmot/m:case:nom">
		<span title="marmot/m:gender:masc">
		Baum</span>
		</span>
		</span> 
		<span title="corenlp/p:KON">
		<span title="marmot/p:KON">
		und</span>
		</span>`

	matchingTokens, err := sm.FindMatchingTokens(snippet)
	require.NoError(t, err)

	// Should find 3 matching tokens: "Der", "alte", "Baum" (all with gender:masc)
	// but not "und" (no gender annotation)
	assert.Len(t, matchingTokens, 3)

	expectedTexts := []string{"Der", "alte", "Baum"}
	for i, token := range matchingTokens {
		assert.Equal(t, expectedTexts[i], token.Text)

		// Verify that each token has the required annotation
		hasGenderMasc := false
		for _, annotation := range token.Annotations {
			if annotation == "marmot/m:gender:masc" {
				hasGenderMasc = true
				break
			}
		}
		assert.True(t, hasGenderMasc, "Token %s should have marmot/m:gender:masc annotation", token.Text)
	}
}
+
// TestSnippetMatcher_CheckTokenSequence verifies that CheckTokenSequence
// matches when any token in the sequence matches individually, rejects
// sequences with no matching annotations, and handles the empty sequence.
func TestSnippetMatcher_CheckTokenSequence(t *testing.T) {
	// Create a pattern for testing
	pattern := ast.Pattern{
		Root: &ast.Term{
			Foundry: "marmot",
			Layer:   "m",
			Key:     "gender",
			Value:   "masc",
			Match:   ast.MatchEqual,
		},
	}

	replacement := ast.Replacement{
		Root: &ast.Term{
			Foundry: "opennlp",
			Layer:   "m",
			Key:     "M",
			Value:   "",
			Match:   ast.MatchEqual,
		},
	}

	sm, err := NewSnippetMatcher(pattern, replacement)
	require.NoError(t, err)

	tests := []struct {
		name        string
		tokens      []TokenSpan
		shouldMatch bool
	}{
		{
			name: "Sequence with matching token",
			tokens: []TokenSpan{
				{
					Text: "Der",
					Annotations: []string{
						"marmot/m:gender:masc",
						"marmot/m:case:nom",
					},
				},
				{
					Text: "alte",
					Annotations: []string{
						"marmot/m:gender:fem",
						"marmot/m:case:nom",
					},
				},
			},
			shouldMatch: true, // First token matches
		},
		{
			name: "Sequence with no matching tokens",
			tokens: []TokenSpan{
				{
					Text: "und",
					Annotations: []string{
						"marmot/p:KON",
					},
				},
				{
					Text: "oder",
					Annotations: []string{
						"marmot/p:KON",
					},
				},
			},
			shouldMatch: false,
		},
		{
			name:        "Empty sequence",
			tokens:      []TokenSpan{},
			shouldMatch: false,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			matches, err := sm.CheckTokenSequence(tt.tokens)
			require.NoError(t, err)
			assert.Equal(t, tt.shouldMatch, matches)
		})
	}
}
+
+func TestSnippetMatcher_GetReplacement(t *testing.T) {
+	pattern := ast.Pattern{
+		Root: &ast.Term{
+			Foundry: "marmot",
+			Layer:   "m",
+			Key:     "gender",
+			Value:   "masc",
+			Match:   ast.MatchEqual,
+		},
+	}
+
+	replacement := ast.Replacement{
+		Root: &ast.Term{
+			Foundry: "opennlp",
+			Layer:   "m",
+			Key:     "M",
+			Value:   "",
+			Match:   ast.MatchEqual,
+		},
+	}
+
+	sm, err := NewSnippetMatcher(pattern, replacement)
+	require.NoError(t, err)
+
+	replacementNode := sm.GetReplacement()
+	require.NotNil(t, replacementNode)
+
+	term, ok := replacementNode.(*ast.Term)
+	require.True(t, ok)
+	assert.Equal(t, "opennlp", term.Foundry)
+	assert.Equal(t, "m", term.Layer)
+	assert.Equal(t, "M", term.Key)
+}
+
// TestSnippetMatcher_RealWorldExample runs an end-to-end check on a real
// deeply nested snippet: one token is extracted, it carries all eight
// enclosing title annotations, and it matches the gender:masc pattern.
func TestSnippetMatcher_RealWorldExample(t *testing.T) {
	// Test with the real-world example from the response test
	pattern := ast.Pattern{
		Root: &ast.Term{
			Foundry: "marmot",
			Layer:   "m",
			Key:     "gender",
			Value:   "masc",
			Match:   ast.MatchEqual,
		},
	}

	replacement := ast.Replacement{
		Root: &ast.Term{
			Foundry: "opennlp",
			Layer:   "m",
			Key:     "M",
			Value:   "",
			Match:   ast.MatchEqual,
		},
	}

	sm, err := NewSnippetMatcher(pattern, replacement)
	require.NoError(t, err)

	// Real snippet from the test file
	snippet := `<span title="corenlp/p:ART">` +
		`<span title="marmot/m:case:nom">` +
		`<span title="marmot/m:gender:masc">` +
		`<span title="marmot/m:number:sg">` +
		`<span title="marmot/p:ART">` +
		`<span title="opennlp/p:ART">` +
		`<span title="tt/l:die">` +
		`<span title="tt/p:ART">Der</span>` +
		`</span>` +
		`</span>` +
		`</span>` +
		`</span>` +
		`</span>` +
		`</span>` +
		`</span>`

	// Parse the snippet
	tokens, err := sm.ParseSnippet(snippet)
	require.NoError(t, err)
	require.Len(t, tokens, 1)

	token := tokens[0]
	assert.Equal(t, "Der", token.Text)

	// Check that it has all expected annotations
	expectedAnnotations := []string{
		"corenlp/p:ART",
		"marmot/m:case:nom",
		"marmot/m:gender:masc",
		"marmot/m:number:sg",
		"marmot/p:ART",
		"opennlp/p:ART",
		"tt/l:die",
		"tt/p:ART",
	}

	assert.Len(t, token.Annotations, len(expectedAnnotations))
	for _, expected := range expectedAnnotations {
		assert.Contains(t, token.Annotations, expected)
	}

	// Check that it matches our pattern
	matches, err := sm.CheckToken(token)
	require.NoError(t, err)
	assert.True(t, matches)
}