blob: f740d65aa386231525fc27e1f8084cc6ebdb631c [file] [log] [blame]
Akroncc9a8a62025-06-25 11:56:28 +02001package matcher
2
3import (
4 "testing"
5
6 "github.com/KorAP/KoralPipe-TermMapper/ast"
7 "github.com/stretchr/testify/assert"
8 "github.com/stretchr/testify/require"
9)
10
11func TestSnippetMatcher_ParseSnippet(t *testing.T) {
12 // Create a pattern for testing
13 pattern := ast.Pattern{
14 Root: &ast.Term{
15 Foundry: "marmot",
16 Layer: "m",
17 Key: "gender",
18 Value: "masc",
19 Match: ast.MatchEqual,
20 },
21 }
22
23 replacement := ast.Replacement{
24 Root: &ast.Term{
25 Foundry: "opennlp",
26 Layer: "m",
27 Key: "M",
28 Value: "",
29 Match: ast.MatchEqual,
30 },
31 }
32
33 sm, err := NewSnippetMatcher(pattern, replacement)
34 require.NoError(t, err)
35
36 tests := []struct {
37 name string
38 snippet string
39 expectedTokens int
40 expectedContains []string
41 }{
42 {
43 name: "Simple single token",
44 snippet: `<span title="corenlp/p:ART">
45 <span title="marmot/m:case:nom">
46 <span title="marmot/m:gender:masc">
47 <span title="marmot/m:number:sg">
48 <span title="marmot/p:ART">
49 Der</span>
50 </span>
51 </span>
52 </span>
53 </span>`,
54 expectedTokens: 1,
55 expectedContains: []string{"Der"},
56 },
57 {
58 name: "Multiple tokens",
59 snippet: `<span title="corenlp/p:ART">
60 <span title="marmot/m:case:nom">
61 <span title="marmot/m:gender:masc">
62 Der</span>
63 </span>
64 </span>
65 <span title="corenlp/p:ADJA">
66 <span title="marmot/m:case:nom">
67 <span title="marmot/m:gender:masc">
68 alte</span>
69 </span>
70 </span>`,
71 expectedTokens: 2,
72 expectedContains: []string{"Der", "alte"},
73 },
74 {
75 name: "Real-world example from test",
76 snippet: `<span title="corenlp/p:ART">
77 <span title="marmot/m:case:nom">
78 <span title="marmot/m:gender:masc">
79 <span title="marmot/m:number:sg">
80 <span title="marmot/p:ART">
81 <span title="opennlp/p:ART">
82 <span title="tt/l:die">
83 <span title="tt/p:ART">Der</span>
84 </span>
85 </span>
86 </span>
87 </span>
88 </span>
89 </span>
90 </span>`,
91 expectedTokens: 1,
92 expectedContains: []string{"Der"},
93 },
94 {
95 name: "Empty snippet",
96 snippet: "",
97 expectedTokens: 0,
98 expectedContains: []string{},
99 },
100 {
101 name: "No span elements",
102 snippet: "Just some text",
103 expectedTokens: 0,
104 expectedContains: []string{},
105 },
106 }
107
108 for _, tt := range tests {
109 t.Run(tt.name, func(t *testing.T) {
110 tokens, err := sm.ParseSnippet(tt.snippet)
111 require.NoError(t, err)
112
113 assert.Len(t, tokens, tt.expectedTokens)
114
115 for i, expectedText := range tt.expectedContains {
116 if i < len(tokens) {
117 assert.Equal(t, expectedText, tokens[i].Text)
118 }
119 }
120 })
121 }
122}
123
124func TestSnippetMatcher_CheckToken(t *testing.T) {
125 // Create a pattern that matches tokens with marmot/m:gender=masc
126 pattern := ast.Pattern{
127 Root: &ast.Term{
128 Foundry: "marmot",
129 Layer: "m",
130 Key: "gender",
131 Value: "masc",
132 Match: ast.MatchEqual,
133 },
134 }
135
136 replacement := ast.Replacement{
137 Root: &ast.Term{
138 Foundry: "opennlp",
139 Layer: "m",
140 Key: "M",
141 Value: "",
142 Match: ast.MatchEqual,
143 },
144 }
145
146 sm, err := NewSnippetMatcher(pattern, replacement)
147 require.NoError(t, err)
148
149 tests := []struct {
150 name string
151 token TokenSpan
152 shouldMatch bool
153 }{
154 {
155 name: "Token with matching annotation",
156 token: TokenSpan{
157 Text: "Der",
158 Annotations: []string{
159 "corenlp/p:ART",
160 "marmot/m:case:nom",
161 "marmot/m:gender:masc",
162 "marmot/m:number:sg",
163 },
164 },
165 shouldMatch: true,
166 },
167 {
168 name: "Token without matching annotation",
169 token: TokenSpan{
170 Text: "und",
171 Annotations: []string{
172 "corenlp/p:KON",
173 "marmot/p:KON",
174 "opennlp/p:KON",
175 },
176 },
177 shouldMatch: false,
178 },
179 {
180 name: "Token with no annotations",
181 token: TokenSpan{
182 Text: "text",
183 Annotations: []string{},
184 },
185 shouldMatch: false,
186 },
187 {
188 name: "Token with different gender value",
189 token: TokenSpan{
190 Text: "andere",
191 Annotations: []string{
192 "marmot/m:gender:fem",
193 "marmot/m:case:nom",
194 },
195 },
196 shouldMatch: false,
197 },
198 }
199
200 for _, tt := range tests {
201 t.Run(tt.name, func(t *testing.T) {
202 matches, err := sm.CheckToken(tt.token)
203 require.NoError(t, err)
204 assert.Equal(t, tt.shouldMatch, matches)
205 })
206 }
207}
208
209func TestSnippetMatcher_FindMatchingTokens(t *testing.T) {
210 // Create a pattern that matches tokens with marmot/m:gender=masc
211 pattern := ast.Pattern{
212 Root: &ast.Term{
213 Foundry: "marmot",
214 Layer: "m",
215 Key: "gender",
216 Value: "masc",
217 Match: ast.MatchEqual,
218 },
219 }
220
221 replacement := ast.Replacement{
222 Root: &ast.Term{
223 Foundry: "opennlp",
224 Layer: "m",
225 Key: "M",
226 Value: "",
227 Match: ast.MatchEqual,
228 },
229 }
230
231 sm, err := NewSnippetMatcher(pattern, replacement)
232 require.NoError(t, err)
233
234 // Test snippet with mixed tokens - some matching, some not
235 snippet := `<span title="corenlp/p:ART">
236 <span title="marmot/m:case:nom">
237 <span title="marmot/m:gender:masc">
238 <span title="marmot/m:number:sg">
239 Der</span>
240 </span>
241 </span>
242 </span>
243 <span title="corenlp/p:ADJA">
244 <span title="marmot/m:case:nom">
245 <span title="marmot/m:gender:masc">
246 alte</span>
247 </span>
248 </span>
249 <span title="corenlp/p:NN">
250 <span title="marmot/m:case:nom">
251 <span title="marmot/m:gender:masc">
252 Baum</span>
253 </span>
254 </span>
255 <span title="corenlp/p:KON">
256 <span title="marmot/p:KON">
257 und</span>
258 </span>`
259
260 matchingTokens, err := sm.FindMatchingTokens(snippet)
261 require.NoError(t, err)
262
263 // Should find 3 matching tokens: "Der", "alte", "Baum" (all with gender:masc)
264 // but not "und" (no gender annotation)
265 assert.Len(t, matchingTokens, 3)
266
267 expectedTexts := []string{"Der", "alte", "Baum"}
268 for i, token := range matchingTokens {
269 assert.Equal(t, expectedTexts[i], token.Text)
270
271 // Verify that each token has the required annotation
272 hasGenderMasc := false
273 for _, annotation := range token.Annotations {
274 if annotation == "marmot/m:gender:masc" {
275 hasGenderMasc = true
276 break
277 }
278 }
279 assert.True(t, hasGenderMasc, "Token %s should have marmot/m:gender:masc annotation", token.Text)
280 }
281}
282
283func TestSnippetMatcher_CheckTokenSequence(t *testing.T) {
284 // Create a pattern for testing
285 pattern := ast.Pattern{
286 Root: &ast.Term{
287 Foundry: "marmot",
288 Layer: "m",
289 Key: "gender",
290 Value: "masc",
291 Match: ast.MatchEqual,
292 },
293 }
294
295 replacement := ast.Replacement{
296 Root: &ast.Term{
297 Foundry: "opennlp",
298 Layer: "m",
299 Key: "M",
300 Value: "",
301 Match: ast.MatchEqual,
302 },
303 }
304
305 sm, err := NewSnippetMatcher(pattern, replacement)
306 require.NoError(t, err)
307
308 tests := []struct {
309 name string
310 tokens []TokenSpan
311 shouldMatch bool
312 }{
313 {
314 name: "Sequence with matching token",
315 tokens: []TokenSpan{
316 {
317 Text: "Der",
318 Annotations: []string{
319 "marmot/m:gender:masc",
320 "marmot/m:case:nom",
321 },
322 },
323 {
324 Text: "alte",
325 Annotations: []string{
326 "marmot/m:gender:fem",
327 "marmot/m:case:nom",
328 },
329 },
330 },
331 shouldMatch: true, // First token matches
332 },
333 {
334 name: "Sequence with no matching tokens",
335 tokens: []TokenSpan{
336 {
337 Text: "und",
338 Annotations: []string{
339 "marmot/p:KON",
340 },
341 },
342 {
343 Text: "oder",
344 Annotations: []string{
345 "marmot/p:KON",
346 },
347 },
348 },
349 shouldMatch: false,
350 },
351 {
352 name: "Empty sequence",
353 tokens: []TokenSpan{},
354 shouldMatch: false,
355 },
356 }
357
358 for _, tt := range tests {
359 t.Run(tt.name, func(t *testing.T) {
360 matches, err := sm.CheckTokenSequence(tt.tokens)
361 require.NoError(t, err)
362 assert.Equal(t, tt.shouldMatch, matches)
363 })
364 }
365}
366
367func TestSnippetMatcher_GetReplacement(t *testing.T) {
368 pattern := ast.Pattern{
369 Root: &ast.Term{
370 Foundry: "marmot",
371 Layer: "m",
372 Key: "gender",
373 Value: "masc",
374 Match: ast.MatchEqual,
375 },
376 }
377
378 replacement := ast.Replacement{
379 Root: &ast.Term{
380 Foundry: "opennlp",
381 Layer: "m",
382 Key: "M",
383 Value: "",
384 Match: ast.MatchEqual,
385 },
386 }
387
388 sm, err := NewSnippetMatcher(pattern, replacement)
389 require.NoError(t, err)
390
391 replacementNode := sm.GetReplacement()
392 require.NotNil(t, replacementNode)
393
394 term, ok := replacementNode.(*ast.Term)
395 require.True(t, ok)
396 assert.Equal(t, "opennlp", term.Foundry)
397 assert.Equal(t, "m", term.Layer)
398 assert.Equal(t, "M", term.Key)
399}
400
401func TestSnippetMatcher_RealWorldExample(t *testing.T) {
402 // Test with the real-world example from the response test
403 pattern := ast.Pattern{
404 Root: &ast.Term{
405 Foundry: "marmot",
406 Layer: "m",
407 Key: "gender",
408 Value: "masc",
409 Match: ast.MatchEqual,
410 },
411 }
412
413 replacement := ast.Replacement{
414 Root: &ast.Term{
415 Foundry: "opennlp",
416 Layer: "m",
417 Key: "M",
418 Value: "",
419 Match: ast.MatchEqual,
420 },
421 }
422
423 sm, err := NewSnippetMatcher(pattern, replacement)
424 require.NoError(t, err)
425
426 // Real snippet from the test file
427 snippet := `<span title="corenlp/p:ART">` +
428 `<span title="marmot/m:case:nom">` +
429 `<span title="marmot/m:gender:masc">` +
430 `<span title="marmot/m:number:sg">` +
431 `<span title="marmot/p:ART">` +
432 `<span title="opennlp/p:ART">` +
433 `<span title="tt/l:die">` +
434 `<span title="tt/p:ART">Der</span>` +
435 `</span>` +
436 `</span>` +
437 `</span>` +
438 `</span>` +
439 `</span>` +
440 `</span>` +
441 `</span>`
442
443 // Parse the snippet
444 tokens, err := sm.ParseSnippet(snippet)
445 require.NoError(t, err)
446 require.Len(t, tokens, 1)
447
448 token := tokens[0]
449 assert.Equal(t, "Der", token.Text)
450
451 // Check that it has all expected annotations
452 expectedAnnotations := []string{
453 "corenlp/p:ART",
454 "marmot/m:case:nom",
455 "marmot/m:gender:masc",
456 "marmot/m:number:sg",
457 "marmot/p:ART",
458 "opennlp/p:ART",
459 "tt/l:die",
460 "tt/p:ART",
461 }
462
463 assert.Len(t, token.Annotations, len(expectedAnnotations))
464 for _, expected := range expectedAnnotations {
465 assert.Contains(t, token.Annotations, expected)
466 }
467
468 // Check that it matches our pattern
469 matches, err := sm.CheckToken(token)
470 require.NoError(t, err)
471 assert.True(t, matches)
472}