blob: fe664a6d2f9dd834bd6aa5db4ae27dc2a730cb55 [file] [log] [blame]
Akroncc9a8a62025-06-25 11:56:28 +02001package matcher
2
3import (
4 "fmt"
5 "sort"
6 "strings"
7
Akron2ef703c2025-07-03 15:57:42 +02008 "github.com/KorAP/Koral-Mapper/ast"
9 "github.com/KorAP/Koral-Mapper/parser"
Akroncc9a8a62025-06-25 11:56:28 +020010 "github.com/orisano/gosax"
11)
12
// TokenSpan describes a single token found in a snippet: its text, where it
// sits in the snippet's text content, and the annotations that apply to it.
type TokenSpan struct {
	// Text is the token text with surrounding whitespace removed.
	Text string
	// StartPos is the offset at which the token starts, measured in bytes
	// of text content (markup is not counted).
	StartPos int
	// EndPos is the offset just past the token, in the same units as
	// StartPos.
	EndPos int
	// Annotations holds the title attributes of every span that encloses
	// the token.
	Annotations []string
}
20
21// SnippetMatcher extends the basic matcher to work with HTML/XML snippets
22type SnippetMatcher struct {
23 matcher *Matcher
24 titleParser *parser.TitleAttributeParser
25}
26
27// NewSnippetMatcher creates a new snippet matcher
28func NewSnippetMatcher(pattern ast.Pattern, replacement ast.Replacement) (*SnippetMatcher, error) {
29 matcher, err := NewMatcher(pattern, replacement)
30 if err != nil {
31 return nil, fmt.Errorf("failed to create base matcher: %w", err)
32 }
33
34 return &SnippetMatcher{
35 matcher: matcher,
36 titleParser: parser.NewTitleAttributeParser(),
37 }, nil
38}
39
40// ParseSnippet parses an HTML/XML snippet and extracts tokens with their annotations
41func (sm *SnippetMatcher) ParseSnippet(snippet string) ([]TokenSpan, error) {
42 tokens := make([]TokenSpan, 0)
43
44 // Stack to track nested spans and their annotations
45 type spanInfo struct {
46 title string
47 level int
48 }
49 spanStack := make([]spanInfo, 0)
50
51 // Current position tracking
52 var currentPos int
53
54 reader := strings.NewReader(snippet)
55 r := gosax.NewReader(reader)
56
57 for {
58 e, err := r.Event()
59 if err != nil {
60 return nil, fmt.Errorf("failed to parse snippet: %w", err)
61 }
62
63 if e.Type() == 8 { // gosax.EventEOF
64 break
65 }
66
67 switch e.Type() {
68 case 1: // gosax.EventStart
69 // Parse start element
70 startElem, err := gosax.StartElement(e.Bytes)
71 if err != nil {
72 continue // Skip invalid elements
73 }
74
75 if startElem.Name.Local == "span" {
76 // Look for title attribute
77 var title string
78 for _, attr := range startElem.Attr {
79 if attr.Name.Local == "title" {
80 title = attr.Value
81 break
82 }
83 }
84 spanStack = append(spanStack, spanInfo{title: title, level: len(spanStack)})
85 }
86
87 case 2: // gosax.EventEnd
88 // Parse end element
89 endElem := gosax.EndElement(e.Bytes)
90 if endElem.Name.Local == "span" && len(spanStack) > 0 {
91 spanStack = spanStack[:len(spanStack)-1]
92 }
93
94 case 3: // gosax.EventText
95 // Process character data
96 charData, err := gosax.CharData(e.Bytes)
97 if err != nil {
98 continue
99 }
100
101 text := string(charData)
102 trimmed := strings.TrimSpace(text)
103 if trimmed != "" && len(spanStack) > 0 {
104 // Only create tokens if we're inside at least one span
105 // Collect all annotations from the current span stack
106 annotations := make([]string, 0)
107 for _, span := range spanStack {
108 if span.title != "" {
109 annotations = append(annotations, span.title)
110 }
111 }
112
113 // Create token span
114 token := TokenSpan{
115 Text: trimmed,
116 StartPos: currentPos,
117 EndPos: currentPos + len(trimmed),
118 Annotations: annotations,
119 }
120 tokens = append(tokens, token)
121 }
122 currentPos += len(text)
123 }
124 }
125
126 // Sort tokens by start position to ensure proper order
127 sort.Slice(tokens, func(i, j int) bool {
128 return tokens[i].StartPos < tokens[j].StartPos
129 })
130
131 return tokens, nil
132}
133
134// CheckToken checks if a token's annotations match the pattern
135func (sm *SnippetMatcher) CheckToken(token TokenSpan) (bool, error) {
136 if len(token.Annotations) == 0 {
137 return false, nil
138 }
139
140 // Parse all annotations into AST terms
141 terms, err := sm.titleParser.ParseTitleAttributesToTerms(token.Annotations)
142 if err != nil {
143 return false, fmt.Errorf("failed to parse token annotations: %w", err)
144 }
145
146 if len(terms) == 0 {
147 return false, nil
148 }
149
150 // Create a TermGroup with AND relation for all annotations
151 var nodeToMatch ast.Node
152 if len(terms) == 1 {
153 nodeToMatch = terms[0]
154 } else {
155 nodeToMatch = &ast.TermGroup{
156 Operands: terms,
157 Relation: ast.AndRelation,
158 }
159 }
160
161 // Check if the constructed node matches our pattern
162 return sm.matcher.Match(nodeToMatch), nil
163}
164
Akroncc9a8a62025-06-25 11:56:28 +0200165// FindMatchingTokens finds all tokens in the snippet that match the pattern
166func (sm *SnippetMatcher) FindMatchingTokens(snippet string) ([]TokenSpan, error) {
167 tokens, err := sm.ParseSnippet(snippet)
168 if err != nil {
169 return nil, err
170 }
171
172 matchingTokens := make([]TokenSpan, 0)
Akroncc9a8a62025-06-25 11:56:28 +0200173 for _, token := range tokens {
Akron21e47762025-07-03 14:11:11 +0200174 if matches, err := sm.CheckToken(token); err != nil {
Akroncc9a8a62025-06-25 11:56:28 +0200175 return nil, fmt.Errorf("failed to check token '%s': %w", token.Text, err)
Akron21e47762025-07-03 14:11:11 +0200176 } else if matches {
Akroncc9a8a62025-06-25 11:56:28 +0200177 matchingTokens = append(matchingTokens, token)
178 }
179 }
180
181 return matchingTokens, nil
182}