Implemented snippet_matcher to parse snippets and match against mapping lists
Change-Id: I79c7736dbfcc9f6ca61486e9211647974f011bd3
diff --git a/matcher/snippet_matcher.go b/matcher/snippet_matcher.go
new file mode 100644
index 0000000..35e9461
--- /dev/null
+++ b/matcher/snippet_matcher.go
@@ -0,0 +1,282 @@
+package matcher
+
+import (
+ "fmt"
+ "sort"
+ "strings"
+
+ "github.com/KorAP/KoralPipe-TermMapper/ast"
+ "github.com/KorAP/KoralPipe-TermMapper/parser"
+ "github.com/orisano/gosax"
+)
+
+// TokenSpan represents a token and its position in the snippet.
+// Start/end positions are byte offsets into the concatenated character
+// data of the snippet (markup is not counted), as maintained by
+// ParseSnippet's position counter.
+type TokenSpan struct {
+ Text string // The actual token text (whitespace-trimmed)
+ StartPos int // Character position where the token starts
+ EndPos int // Character position where the token ends
+ Annotations []string // All title attributes of enclosing spans, outermost first
+}
+
+// SnippetMatcher extends the basic matcher to work with HTML/XML snippets:
+// it extracts annotated tokens from snippet markup and matches their
+// annotations against the configured pattern.
+type SnippetMatcher struct {
+ matcher *Matcher // underlying pattern/replacement matcher
+ titleParser *parser.TitleAttributeParser // parses span title attributes into AST terms
+}
+
+// NewSnippetMatcher creates a new snippet matcher that applies the given
+// pattern/replacement pair to tokens extracted from HTML/XML snippets.
+// It returns an error if the underlying base matcher cannot be built.
+func NewSnippetMatcher(pattern ast.Pattern, replacement ast.Replacement) (*SnippetMatcher, error) {
+	matcher, err := NewMatcher(pattern, replacement)
+	if err != nil {
+		// Say what this layer was doing; the wrapped error carries the cause.
+		return nil, fmt.Errorf("creating base matcher: %w", err)
+	}
+
+	return &SnippetMatcher{
+		matcher:     matcher,
+		titleParser: parser.NewTitleAttributeParser(),
+	}, nil
+}
+
+// ParseSnippet parses an HTML/XML snippet and extracts tokens with their
+// annotations. A token is any non-blank run of character data that occurs
+// inside at least one <span> element; its Annotations collect the non-empty
+// title attributes of all enclosing spans, outermost first. Positions are
+// offsets into the concatenated character data of the snippet.
+func (sm *SnippetMatcher) ParseSnippet(snippet string) ([]TokenSpan, error) {
+	tokens := make([]TokenSpan, 0)
+
+	// Stack of title attributes of the currently open <span> elements.
+	// Spans without a title push an empty string so the stack depth always
+	// mirrors the <span> nesting depth.
+	spanTitles := make([]string, 0)
+
+	// Character offset into the concatenated text content seen so far.
+	var currentPos int
+
+	r := gosax.NewReader(strings.NewReader(snippet))
+
+	for {
+		e, err := r.Event()
+		if err != nil {
+			return nil, fmt.Errorf("failed to parse snippet: %w", err)
+		}
+
+		switch e.Type() {
+		case gosax.EventEOF:
+			// Sort tokens by start position to ensure proper order.
+			sort.Slice(tokens, func(i, j int) bool {
+				return tokens[i].StartPos < tokens[j].StartPos
+			})
+			return tokens, nil
+
+		case gosax.EventStart:
+			startElem, err := gosax.StartElement(e.Bytes)
+			if err != nil {
+				continue // Skip invalid elements
+			}
+			if startElem.Name.Local != "span" {
+				continue
+			}
+			// Record the span's title attribute (empty if absent).
+			var title string
+			for _, attr := range startElem.Attr {
+				if attr.Name.Local == "title" {
+					title = attr.Value
+					break
+				}
+			}
+			spanTitles = append(spanTitles, title)
+
+		case gosax.EventEnd:
+			endElem := gosax.EndElement(e.Bytes)
+			if endElem.Name.Local == "span" && len(spanTitles) > 0 {
+				spanTitles = spanTitles[:len(spanTitles)-1]
+			}
+
+		case gosax.EventText:
+			charData, err := gosax.CharData(e.Bytes)
+			if err != nil {
+				continue
+			}
+
+			text := string(charData)
+			trimmed := strings.TrimSpace(text)
+			// Only create tokens for non-blank text inside at least one span.
+			if trimmed != "" && len(spanTitles) > 0 {
+				// Collect all non-empty annotations from the enclosing spans.
+				annotations := make([]string, 0, len(spanTitles))
+				for _, title := range spanTitles {
+					if title != "" {
+						annotations = append(annotations, title)
+					}
+				}
+
+				// NOTE(review): StartPos is the offset of the raw text node,
+				// not of the trimmed token — leading whitespace inside the
+				// node is not skipped. Confirm this is intended.
+				tokens = append(tokens, TokenSpan{
+					Text:        trimmed,
+					StartPos:    currentPos,
+					EndPos:      currentPos + len(trimmed),
+					Annotations: annotations,
+				})
+			}
+			currentPos += len(text)
+		}
+	}
+}
+
+// CheckToken reports whether a token's annotations match the pattern.
+//
+// All annotations on the token are parsed into AST terms; a single term is
+// matched directly, while multiple terms are AND-combined into a TermGroup
+// before matching. Tokens without annotations never match.
+func (sm *SnippetMatcher) CheckToken(token TokenSpan) (bool, error) {
+	if len(token.Annotations) == 0 {
+		return false, nil
+	}
+
+	terms, err := sm.titleParser.ParseTitleAttributesToTerms(token.Annotations)
+	if err != nil {
+		return false, fmt.Errorf("failed to parse token annotations: %w", err)
+	}
+	if len(terms) == 0 {
+		return false, nil
+	}
+
+	// One term matches as-is; several are joined with an AND relation.
+	var node ast.Node = terms[0]
+	if len(terms) > 1 {
+		node = &ast.TermGroup{
+			Operands: terms,
+			Relation: ast.AndRelation,
+		}
+	}
+
+	return sm.matcher.Match(node), nil
+}
+
+// CheckTokenSequence reports whether a sequence of tokens matches the
+// pattern. Two strategies are tried in order:
+//
+//  1. each individual token is checked on its own;
+//  2. the deduplicated union of all tokens' annotations is attached to a
+//     single synthetic token spanning the whole sequence and checked.
+func (sm *SnippetMatcher) CheckTokenSequence(tokens []TokenSpan) (bool, error) {
+	if len(tokens) == 0 {
+		return false, nil
+	}
+
+	// Strategy 1: any single token matches.
+	for _, tok := range tokens {
+		ok, err := sm.CheckToken(tok)
+		if err != nil {
+			return false, err
+		}
+		if ok {
+			return true, nil
+		}
+	}
+
+	// Strategy 2: combine annotations across all tokens, dropping
+	// duplicates while preserving first-seen order.
+	seen := make(map[string]bool)
+	unique := make([]string, 0)
+	for _, tok := range tokens {
+		for _, annotation := range tok.Annotations {
+			if !seen[annotation] {
+				seen[annotation] = true
+				unique = append(unique, annotation)
+			}
+		}
+	}
+	if len(unique) == 0 {
+		return false, nil
+	}
+
+	// Synthetic token covering the whole sequence.
+	combined := TokenSpan{
+		Text:        strings.Join(getTokenTexts(tokens), " "),
+		StartPos:    tokens[0].StartPos,
+		EndPos:      tokens[len(tokens)-1].EndPos,
+		Annotations: unique,
+	}
+
+	return sm.CheckToken(combined)
+}
+
+// FindMatchingTokens parses the snippet and returns every token whose
+// annotations match the pattern, in snippet order.
+func (sm *SnippetMatcher) FindMatchingTokens(snippet string) ([]TokenSpan, error) {
+	tokens, err := sm.ParseSnippet(snippet)
+	if err != nil {
+		return nil, err
+	}
+
+	matching := make([]TokenSpan, 0)
+	for _, tok := range tokens {
+		ok, err := sm.CheckToken(tok)
+		if err != nil {
+			return nil, fmt.Errorf("failed to check token '%s': %w", tok.Text, err)
+		}
+		if ok {
+			matching = append(matching, tok)
+		}
+	}
+
+	return matching, nil
+}
+
+// FindMatchingTokenSequences returns every contiguous token sequence of
+// length 1..maxSequenceLength that matches the pattern. A non-positive
+// maxSequenceLength means "no limit" (up to the full token count).
+func (sm *SnippetMatcher) FindMatchingTokenSequences(snippet string, maxSequenceLength int) ([][]TokenSpan, error) {
+	tokens, err := sm.ParseSnippet(snippet)
+	if err != nil {
+		return nil, err
+	}
+
+	if maxSequenceLength <= 0 {
+		maxSequenceLength = len(tokens)
+	}
+
+	matching := make([][]TokenSpan, 0)
+
+	// Slide a window of every allowed length over the token list.
+	for start := range tokens {
+		for end := start + 1; end <= len(tokens) && end-start <= maxSequenceLength; end++ {
+			seq := tokens[start:end]
+
+			ok, err := sm.CheckTokenSequence(seq)
+			if err != nil {
+				return nil, fmt.Errorf("failed to check token sequence: %w", err)
+			}
+			if ok {
+				matching = append(matching, seq)
+			}
+		}
+	}
+
+	return matching, nil
+}
+
+// GetReplacement returns the root replacement node configured on the
+// underlying base matcher.
+func (sm *SnippetMatcher) GetReplacement() ast.Node {
+ return sm.matcher.replacement.Root
+}
+
+// getTokenTexts returns the Text field of every token, in order.
+func getTokenTexts(tokens []TokenSpan) []string {
+	texts := make([]string, 0, len(tokens))
+	for _, tok := range tokens {
+		texts = append(texts, tok.Text)
+	}
+	return texts
+}