blob: 35e94614c49bd9e39af713526376616d8ecf063f [file] [log] [blame]
Akroncc9a8a62025-06-25 11:56:28 +02001package matcher
2
3import (
4 "fmt"
5 "sort"
6 "strings"
7
8 "github.com/KorAP/KoralPipe-TermMapper/ast"
9 "github.com/KorAP/KoralPipe-TermMapper/parser"
10 "github.com/orisano/gosax"
11)
12
// TokenSpan represents a token extracted from a snippet and its position
// within the snippet's character data.
type TokenSpan struct {
	Text        string   // The actual token text (whitespace-trimmed)
	StartPos    int      // Byte offset where the token starts (len-based, not rune-based)
	EndPos      int      // Byte offset just past the token's last byte (exclusive)
	Annotations []string // All title attributes of the spans enclosing this token, outermost first
}
20
// SnippetMatcher extends the basic Matcher to work with HTML/XML snippets:
// it tokenizes annotated <span> markup and matches the resulting annotation
// terms against the configured pattern.
type SnippetMatcher struct {
	matcher     *Matcher                     // base pattern/replacement matcher
	titleParser *parser.TitleAttributeParser // parses title attributes into AST terms
}
26
27// NewSnippetMatcher creates a new snippet matcher
28func NewSnippetMatcher(pattern ast.Pattern, replacement ast.Replacement) (*SnippetMatcher, error) {
29 matcher, err := NewMatcher(pattern, replacement)
30 if err != nil {
31 return nil, fmt.Errorf("failed to create base matcher: %w", err)
32 }
33
34 return &SnippetMatcher{
35 matcher: matcher,
36 titleParser: parser.NewTitleAttributeParser(),
37 }, nil
38}
39
40// ParseSnippet parses an HTML/XML snippet and extracts tokens with their annotations
41func (sm *SnippetMatcher) ParseSnippet(snippet string) ([]TokenSpan, error) {
42 tokens := make([]TokenSpan, 0)
43
44 // Stack to track nested spans and their annotations
45 type spanInfo struct {
46 title string
47 level int
48 }
49 spanStack := make([]spanInfo, 0)
50
51 // Current position tracking
52 var currentPos int
53
54 reader := strings.NewReader(snippet)
55 r := gosax.NewReader(reader)
56
57 for {
58 e, err := r.Event()
59 if err != nil {
60 return nil, fmt.Errorf("failed to parse snippet: %w", err)
61 }
62
63 if e.Type() == 8 { // gosax.EventEOF
64 break
65 }
66
67 switch e.Type() {
68 case 1: // gosax.EventStart
69 // Parse start element
70 startElem, err := gosax.StartElement(e.Bytes)
71 if err != nil {
72 continue // Skip invalid elements
73 }
74
75 if startElem.Name.Local == "span" {
76 // Look for title attribute
77 var title string
78 for _, attr := range startElem.Attr {
79 if attr.Name.Local == "title" {
80 title = attr.Value
81 break
82 }
83 }
84 spanStack = append(spanStack, spanInfo{title: title, level: len(spanStack)})
85 }
86
87 case 2: // gosax.EventEnd
88 // Parse end element
89 endElem := gosax.EndElement(e.Bytes)
90 if endElem.Name.Local == "span" && len(spanStack) > 0 {
91 spanStack = spanStack[:len(spanStack)-1]
92 }
93
94 case 3: // gosax.EventText
95 // Process character data
96 charData, err := gosax.CharData(e.Bytes)
97 if err != nil {
98 continue
99 }
100
101 text := string(charData)
102 trimmed := strings.TrimSpace(text)
103 if trimmed != "" && len(spanStack) > 0 {
104 // Only create tokens if we're inside at least one span
105 // Collect all annotations from the current span stack
106 annotations := make([]string, 0)
107 for _, span := range spanStack {
108 if span.title != "" {
109 annotations = append(annotations, span.title)
110 }
111 }
112
113 // Create token span
114 token := TokenSpan{
115 Text: trimmed,
116 StartPos: currentPos,
117 EndPos: currentPos + len(trimmed),
118 Annotations: annotations,
119 }
120 tokens = append(tokens, token)
121 }
122 currentPos += len(text)
123 }
124 }
125
126 // Sort tokens by start position to ensure proper order
127 sort.Slice(tokens, func(i, j int) bool {
128 return tokens[i].StartPos < tokens[j].StartPos
129 })
130
131 return tokens, nil
132}
133
134// CheckToken checks if a token's annotations match the pattern
135func (sm *SnippetMatcher) CheckToken(token TokenSpan) (bool, error) {
136 if len(token.Annotations) == 0 {
137 return false, nil
138 }
139
140 // Parse all annotations into AST terms
141 terms, err := sm.titleParser.ParseTitleAttributesToTerms(token.Annotations)
142 if err != nil {
143 return false, fmt.Errorf("failed to parse token annotations: %w", err)
144 }
145
146 if len(terms) == 0 {
147 return false, nil
148 }
149
150 // Create a TermGroup with AND relation for all annotations
151 var nodeToMatch ast.Node
152 if len(terms) == 1 {
153 nodeToMatch = terms[0]
154 } else {
155 nodeToMatch = &ast.TermGroup{
156 Operands: terms,
157 Relation: ast.AndRelation,
158 }
159 }
160
161 // Check if the constructed node matches our pattern
162 return sm.matcher.Match(nodeToMatch), nil
163}
164
165// CheckTokenSequence checks if a sequence of tokens matches the pattern
166func (sm *SnippetMatcher) CheckTokenSequence(tokens []TokenSpan) (bool, error) {
167 if len(tokens) == 0 {
168 return false, nil
169 }
170
171 // For token sequences, we need to check different strategies:
172 // 1. Check if any individual token matches
173 // 2. Check if the combined annotations of all tokens match
174
175 // Strategy 1: Check individual tokens
176 for _, token := range tokens {
177 matches, err := sm.CheckToken(token)
178 if err != nil {
179 return false, err
180 }
181 if matches {
182 return true, nil
183 }
184 }
185
186 // Strategy 2: Check combined annotations
187 allAnnotations := make([]string, 0)
188 for _, token := range tokens {
189 allAnnotations = append(allAnnotations, token.Annotations...)
190 }
191
192 // Remove duplicates from combined annotations
193 annotationMap := make(map[string]bool)
194 uniqueAnnotations := make([]string, 0)
195 for _, annotation := range allAnnotations {
196 if !annotationMap[annotation] {
197 annotationMap[annotation] = true
198 uniqueAnnotations = append(uniqueAnnotations, annotation)
199 }
200 }
201
202 if len(uniqueAnnotations) == 0 {
203 return false, nil
204 }
205
206 // Create a combined token for checking
207 combinedToken := TokenSpan{
208 Text: strings.Join(getTokenTexts(tokens), " "),
209 StartPos: tokens[0].StartPos,
210 EndPos: tokens[len(tokens)-1].EndPos,
211 Annotations: uniqueAnnotations,
212 }
213
214 return sm.CheckToken(combinedToken)
215}
216
217// FindMatchingTokens finds all tokens in the snippet that match the pattern
218func (sm *SnippetMatcher) FindMatchingTokens(snippet string) ([]TokenSpan, error) {
219 tokens, err := sm.ParseSnippet(snippet)
220 if err != nil {
221 return nil, err
222 }
223
224 matchingTokens := make([]TokenSpan, 0)
225
226 for _, token := range tokens {
227 matches, err := sm.CheckToken(token)
228 if err != nil {
229 return nil, fmt.Errorf("failed to check token '%s': %w", token.Text, err)
230 }
231 if matches {
232 matchingTokens = append(matchingTokens, token)
233 }
234 }
235
236 return matchingTokens, nil
237}
238
239// FindMatchingTokenSequences finds all token sequences that match the pattern
240func (sm *SnippetMatcher) FindMatchingTokenSequences(snippet string, maxSequenceLength int) ([][]TokenSpan, error) {
241 tokens, err := sm.ParseSnippet(snippet)
242 if err != nil {
243 return nil, err
244 }
245
246 if maxSequenceLength <= 0 {
247 maxSequenceLength = len(tokens)
248 }
249
250 matchingSequences := make([][]TokenSpan, 0)
251
252 // Check all possible token sequences up to maxSequenceLength
253 for start := 0; start < len(tokens); start++ {
254 for length := 1; length <= maxSequenceLength && start+length <= len(tokens); length++ {
255 sequence := tokens[start : start+length]
256
257 matches, err := sm.CheckTokenSequence(sequence)
258 if err != nil {
259 return nil, fmt.Errorf("failed to check token sequence: %w", err)
260 }
261 if matches {
262 matchingSequences = append(matchingSequences, sequence)
263 }
264 }
265 }
266
267 return matchingSequences, nil
268}
269
// GetReplacement returns the root replacement node configured on the
// underlying base matcher.
func (sm *SnippetMatcher) GetReplacement() ast.Node {
	return sm.matcher.replacement.Root
}
274
275// Helper function to extract token texts
276func getTokenTexts(tokens []TokenSpan) []string {
277 texts := make([]string, len(tokens))
278 for i, token := range tokens {
279 texts[i] = token.Text
280 }
281 return texts
282}