Blame - matcher/snippet_matcher.go - KorAP/Koral-Mapper

blob: 35e94614c49bd9e39af713526376616d8ecf063f [file] [log] [blame]

Akron	cc9a8a6	2025-06-25 11:56:28 +0200	[diff] [blame^]	1	package matcher
				2
				3	import (
				4	"fmt"
				5	"sort"
				6	"strings"
				7
				8	"github.com/KorAP/KoralPipe-TermMapper/ast"
				9	"github.com/KorAP/KoralPipe-TermMapper/parser"
				10	"github.com/orisano/gosax"
				11	)
				12
				13	// TokenSpan represents a token and its position in the snippet
				14	type TokenSpan struct {
				15	Text string // The actual token text
				16	StartPos int // Character position where the token starts
				17	EndPos int // Character position where the token ends
				18	Annotations []string // All title attributes that annotate this token
				19	}
				20
				21	// SnippetMatcher extends the basic matcher to work with HTML/XML snippets
				22	type SnippetMatcher struct {
				23	matcher *Matcher
				24	titleParser *parser.TitleAttributeParser
				25	}
				26
				27	// NewSnippetMatcher creates a new snippet matcher
				28	func NewSnippetMatcher(pattern ast.Pattern, replacement ast.Replacement) (*SnippetMatcher, error) {
				29	matcher, err := NewMatcher(pattern, replacement)
				30	if err != nil {
				31	return nil, fmt.Errorf("failed to create base matcher: %w", err)
				32	}
				33
				34	return &SnippetMatcher{
				35	matcher: matcher,
				36	titleParser: parser.NewTitleAttributeParser(),
				37	}, nil
				38	}
				39
				40	// ParseSnippet parses an HTML/XML snippet and extracts tokens with their annotations
				41	func (sm *SnippetMatcher) ParseSnippet(snippet string) ([]TokenSpan, error) {
				42	tokens := make([]TokenSpan, 0)
				43
				44	// Stack to track nested spans and their annotations
				45	type spanInfo struct {
				46	title string
				47	level int
				48	}
				49	spanStack := make([]spanInfo, 0)
				50
				51	// Current position tracking
				52	var currentPos int
				53
				54	reader := strings.NewReader(snippet)
				55	r := gosax.NewReader(reader)
				56
				57	for {
				58	e, err := r.Event()
				59	if err != nil {
				60	return nil, fmt.Errorf("failed to parse snippet: %w", err)
				61	}
				62
				63	if e.Type() == 8 { // gosax.EventEOF
				64	break
				65	}
				66
				67	switch e.Type() {
				68	case 1: // gosax.EventStart
				69	// Parse start element
				70	startElem, err := gosax.StartElement(e.Bytes)
				71	if err != nil {
				72	continue // Skip invalid elements
				73	}
				74
				75	if startElem.Name.Local == "span" {
				76	// Look for title attribute
				77	var title string
				78	for _, attr := range startElem.Attr {
				79	if attr.Name.Local == "title" {
				80	title = attr.Value
				81	break
				82	}
				83	}
				84	spanStack = append(spanStack, spanInfo{title: title, level: len(spanStack)})
				85	}
				86
				87	case 2: // gosax.EventEnd
				88	// Parse end element
				89	endElem := gosax.EndElement(e.Bytes)
				90	if endElem.Name.Local == "span" && len(spanStack) > 0 {
				91	spanStack = spanStack[:len(spanStack)-1]
				92	}
				93
				94	case 3: // gosax.EventText
				95	// Process character data
				96	charData, err := gosax.CharData(e.Bytes)
				97	if err != nil {
				98	continue
				99	}
				100
				101	text := string(charData)
				102	trimmed := strings.TrimSpace(text)
				103	if trimmed != "" && len(spanStack) > 0 {
				104	// Only create tokens if we're inside at least one span
				105	// Collect all annotations from the current span stack
				106	annotations := make([]string, 0)
				107	for _, span := range spanStack {
				108	if span.title != "" {
				109	annotations = append(annotations, span.title)
				110	}
				111	}
				112
				113	// Create token span
				114	token := TokenSpan{
				115	Text: trimmed,
				116	StartPos: currentPos,
				117	EndPos: currentPos + len(trimmed),
				118	Annotations: annotations,
				119	}
				120	tokens = append(tokens, token)
				121	}
				122	currentPos += len(text)
				123	}
				124	}
				125
				126	// Sort tokens by start position to ensure proper order
				127	sort.Slice(tokens, func(i, j int) bool {
				128	return tokens[i].StartPos < tokens[j].StartPos
				129	})
				130
				131	return tokens, nil
				132	}
				133
				134	// CheckToken checks if a token's annotations match the pattern
				135	func (sm *SnippetMatcher) CheckToken(token TokenSpan) (bool, error) {
				136	if len(token.Annotations) == 0 {
				137	return false, nil
				138	}
				139
				140	// Parse all annotations into AST terms
				141	terms, err := sm.titleParser.ParseTitleAttributesToTerms(token.Annotations)
				142	if err != nil {
				143	return false, fmt.Errorf("failed to parse token annotations: %w", err)
				144	}
				145
				146	if len(terms) == 0 {
				147	return false, nil
				148	}
				149
				150	// Create a TermGroup with AND relation for all annotations
				151	var nodeToMatch ast.Node
				152	if len(terms) == 1 {
				153	nodeToMatch = terms[0]
				154	} else {
				155	nodeToMatch = &ast.TermGroup{
				156	Operands: terms,
				157	Relation: ast.AndRelation,
				158	}
				159	}
				160
				161	// Check if the constructed node matches our pattern
				162	return sm.matcher.Match(nodeToMatch), nil
				163	}
				164
				165	// CheckTokenSequence checks if a sequence of tokens matches the pattern
				166	func (sm *SnippetMatcher) CheckTokenSequence(tokens []TokenSpan) (bool, error) {
				167	if len(tokens) == 0 {
				168	return false, nil
				169	}
				170
				171	// For token sequences, we need to check different strategies:
				172	// 1. Check if any individual token matches
				173	// 2. Check if the combined annotations of all tokens match
				174
				175	// Strategy 1: Check individual tokens
				176	for _, token := range tokens {
				177	matches, err := sm.CheckToken(token)
				178	if err != nil {
				179	return false, err
				180	}
				181	if matches {
				182	return true, nil
				183	}
				184	}
				185
				186	// Strategy 2: Check combined annotations
				187	allAnnotations := make([]string, 0)
				188	for _, token := range tokens {
				189	allAnnotations = append(allAnnotations, token.Annotations...)
				190	}
				191
				192	// Remove duplicates from combined annotations
				193	annotationMap := make(map[string]bool)
				194	uniqueAnnotations := make([]string, 0)
				195	for _, annotation := range allAnnotations {
				196	if !annotationMap[annotation] {
				197	annotationMap[annotation] = true
				198	uniqueAnnotations = append(uniqueAnnotations, annotation)
				199	}
				200	}
				201
				202	if len(uniqueAnnotations) == 0 {
				203	return false, nil
				204	}
				205
				206	// Create a combined token for checking
				207	combinedToken := TokenSpan{
				208	Text: strings.Join(getTokenTexts(tokens), " "),
				209	StartPos: tokens[0].StartPos,
				210	EndPos: tokens[len(tokens)-1].EndPos,
				211	Annotations: uniqueAnnotations,
				212	}
				213
				214	return sm.CheckToken(combinedToken)
				215	}
				216
				217	// FindMatchingTokens finds all tokens in the snippet that match the pattern
				218	func (sm *SnippetMatcher) FindMatchingTokens(snippet string) ([]TokenSpan, error) {
				219	tokens, err := sm.ParseSnippet(snippet)
				220	if err != nil {
				221	return nil, err
				222	}
				223
				224	matchingTokens := make([]TokenSpan, 0)
				225
				226	for _, token := range tokens {
				227	matches, err := sm.CheckToken(token)
				228	if err != nil {
				229	return nil, fmt.Errorf("failed to check token '%s': %w", token.Text, err)
				230	}
				231	if matches {
				232	matchingTokens = append(matchingTokens, token)
				233	}
				234	}
				235
				236	return matchingTokens, nil
				237	}
				238
				239	// FindMatchingTokenSequences finds all token sequences that match the pattern
				240	func (sm *SnippetMatcher) FindMatchingTokenSequences(snippet string, maxSequenceLength int) ([][]TokenSpan, error) {
				241	tokens, err := sm.ParseSnippet(snippet)
				242	if err != nil {
				243	return nil, err
				244	}
				245
				246	if maxSequenceLength <= 0 {
				247	maxSequenceLength = len(tokens)
				248	}
				249
				250	matchingSequences := make([][]TokenSpan, 0)
				251
				252	// Check all possible token sequences up to maxSequenceLength
				253	for start := 0; start < len(tokens); start++ {
				254	for length := 1; length <= maxSequenceLength && start+length <= len(tokens); length++ {
				255	sequence := tokens[start : start+length]
				256
				257	matches, err := sm.CheckTokenSequence(sequence)
				258	if err != nil {
				259	return nil, fmt.Errorf("failed to check token sequence: %w", err)
				260	}
				261	if matches {
				262	matchingSequences = append(matchingSequences, sequence)
				263	}
				264	}
				265	}
				266
				267	return matchingSequences, nil
				268	}
				269
				270	// GetReplacement returns the replacement node from the matcher
				271	func (sm *SnippetMatcher) GetReplacement() ast.Node {
				272	return sm.matcher.replacement.Root
				273	}
				274
				275	// Helper function to extract token texts
				276	func getTokenTexts(tokens []TokenSpan) []string {
				277	texts := make([]string, len(tokens))
				278	for i, token := range tokens {
				279	texts[i] = token.Text
				280	}
				281	return texts
				282	}