blob: 9b3a2fd5801c5ffc75e78c1af1cbe5bab0bc5748 [file] [log] [blame]
Akroncc9a8a62025-06-25 11:56:28 +02001package matcher
2
3import (
Akrona8b9fbc2026-03-05 16:43:05 +01004 "slices"
Akroncc9a8a62025-06-25 11:56:28 +02005 "testing"
6
Akron2ef703c2025-07-03 15:57:42 +02007 "github.com/KorAP/Koral-Mapper/ast"
Akroncc9a8a62025-06-25 11:56:28 +02008 "github.com/stretchr/testify/assert"
9 "github.com/stretchr/testify/require"
10)
11
12func TestSnippetMatcher_ParseSnippet(t *testing.T) {
13 // Create a pattern for testing
14 pattern := ast.Pattern{
15 Root: &ast.Term{
16 Foundry: "marmot",
17 Layer: "m",
18 Key: "gender",
19 Value: "masc",
20 Match: ast.MatchEqual,
21 },
22 }
23
24 replacement := ast.Replacement{
25 Root: &ast.Term{
26 Foundry: "opennlp",
27 Layer: "m",
28 Key: "M",
29 Value: "",
30 Match: ast.MatchEqual,
31 },
32 }
33
34 sm, err := NewSnippetMatcher(pattern, replacement)
35 require.NoError(t, err)
36
37 tests := []struct {
38 name string
39 snippet string
40 expectedTokens int
41 expectedContains []string
42 }{
43 {
44 name: "Simple single token",
45 snippet: `<span title="corenlp/p:ART">
46 <span title="marmot/m:case:nom">
47 <span title="marmot/m:gender:masc">
48 <span title="marmot/m:number:sg">
49 <span title="marmot/p:ART">
50 Der</span>
51 </span>
52 </span>
53 </span>
54 </span>`,
55 expectedTokens: 1,
56 expectedContains: []string{"Der"},
57 },
58 {
59 name: "Multiple tokens",
60 snippet: `<span title="corenlp/p:ART">
61 <span title="marmot/m:case:nom">
62 <span title="marmot/m:gender:masc">
63 Der</span>
64 </span>
65 </span>
66 <span title="corenlp/p:ADJA">
67 <span title="marmot/m:case:nom">
68 <span title="marmot/m:gender:masc">
69 alte</span>
70 </span>
71 </span>`,
72 expectedTokens: 2,
73 expectedContains: []string{"Der", "alte"},
74 },
75 {
76 name: "Real-world example from test",
77 snippet: `<span title="corenlp/p:ART">
78 <span title="marmot/m:case:nom">
79 <span title="marmot/m:gender:masc">
80 <span title="marmot/m:number:sg">
81 <span title="marmot/p:ART">
82 <span title="opennlp/p:ART">
83 <span title="tt/l:die">
84 <span title="tt/p:ART">Der</span>
85 </span>
86 </span>
87 </span>
88 </span>
89 </span>
90 </span>
91 </span>`,
92 expectedTokens: 1,
93 expectedContains: []string{"Der"},
94 },
95 {
96 name: "Empty snippet",
97 snippet: "",
98 expectedTokens: 0,
99 expectedContains: []string{},
100 },
101 {
102 name: "No span elements",
103 snippet: "Just some text",
104 expectedTokens: 0,
105 expectedContains: []string{},
106 },
107 }
108
109 for _, tt := range tests {
110 t.Run(tt.name, func(t *testing.T) {
111 tokens, err := sm.ParseSnippet(tt.snippet)
112 require.NoError(t, err)
113
114 assert.Len(t, tokens, tt.expectedTokens)
115
116 for i, expectedText := range tt.expectedContains {
117 if i < len(tokens) {
118 assert.Equal(t, expectedText, tokens[i].Text)
119 }
120 }
121 })
122 }
123}
124
125func TestSnippetMatcher_CheckToken(t *testing.T) {
126 // Create a pattern that matches tokens with marmot/m:gender=masc
127 pattern := ast.Pattern{
128 Root: &ast.Term{
129 Foundry: "marmot",
130 Layer: "m",
131 Key: "gender",
132 Value: "masc",
133 Match: ast.MatchEqual,
134 },
135 }
136
137 replacement := ast.Replacement{
138 Root: &ast.Term{
139 Foundry: "opennlp",
140 Layer: "m",
141 Key: "M",
142 Value: "",
143 Match: ast.MatchEqual,
144 },
145 }
146
147 sm, err := NewSnippetMatcher(pattern, replacement)
148 require.NoError(t, err)
149
150 tests := []struct {
151 name string
152 token TokenSpan
153 shouldMatch bool
154 }{
155 {
156 name: "Token with matching annotation",
157 token: TokenSpan{
158 Text: "Der",
159 Annotations: []string{
160 "corenlp/p:ART",
161 "marmot/m:case:nom",
162 "marmot/m:gender:masc",
163 "marmot/m:number:sg",
164 },
165 },
166 shouldMatch: true,
167 },
168 {
169 name: "Token without matching annotation",
170 token: TokenSpan{
171 Text: "und",
172 Annotations: []string{
173 "corenlp/p:KON",
174 "marmot/p:KON",
175 "opennlp/p:KON",
176 },
177 },
178 shouldMatch: false,
179 },
180 {
181 name: "Token with no annotations",
182 token: TokenSpan{
183 Text: "text",
184 Annotations: []string{},
185 },
186 shouldMatch: false,
187 },
188 {
189 name: "Token with different gender value",
190 token: TokenSpan{
191 Text: "andere",
192 Annotations: []string{
193 "marmot/m:gender:fem",
194 "marmot/m:case:nom",
195 },
196 },
197 shouldMatch: false,
198 },
199 }
200
201 for _, tt := range tests {
202 t.Run(tt.name, func(t *testing.T) {
203 matches, err := sm.CheckToken(tt.token)
204 require.NoError(t, err)
205 assert.Equal(t, tt.shouldMatch, matches)
206 })
207 }
208}
209
210func TestSnippetMatcher_FindMatchingTokens(t *testing.T) {
211 // Create a pattern that matches tokens with marmot/m:gender=masc
212 pattern := ast.Pattern{
213 Root: &ast.Term{
214 Foundry: "marmot",
215 Layer: "m",
216 Key: "gender",
217 Value: "masc",
218 Match: ast.MatchEqual,
219 },
220 }
221
222 replacement := ast.Replacement{
223 Root: &ast.Term{
224 Foundry: "opennlp",
225 Layer: "m",
226 Key: "M",
227 Value: "",
228 Match: ast.MatchEqual,
229 },
230 }
231
232 sm, err := NewSnippetMatcher(pattern, replacement)
233 require.NoError(t, err)
234
235 // Test snippet with mixed tokens - some matching, some not
236 snippet := `<span title="corenlp/p:ART">
237 <span title="marmot/m:case:nom">
238 <span title="marmot/m:gender:masc">
239 <span title="marmot/m:number:sg">
240 Der</span>
241 </span>
242 </span>
243 </span>
244 <span title="corenlp/p:ADJA">
245 <span title="marmot/m:case:nom">
246 <span title="marmot/m:gender:masc">
247 alte</span>
248 </span>
249 </span>
250 <span title="corenlp/p:NN">
251 <span title="marmot/m:case:nom">
252 <span title="marmot/m:gender:masc">
253 Baum</span>
254 </span>
255 </span>
256 <span title="corenlp/p:KON">
257 <span title="marmot/p:KON">
258 und</span>
259 </span>`
260
261 matchingTokens, err := sm.FindMatchingTokens(snippet)
262 require.NoError(t, err)
263
264 // Should find 3 matching tokens: "Der", "alte", "Baum" (all with gender:masc)
265 // but not "und" (no gender annotation)
266 assert.Len(t, matchingTokens, 3)
267
268 expectedTexts := []string{"Der", "alte", "Baum"}
269 for i, token := range matchingTokens {
270 assert.Equal(t, expectedTexts[i], token.Text)
271
272 // Verify that each token has the required annotation
Akrona8b9fbc2026-03-05 16:43:05 +0100273 hasGenderMasc := slices.Contains(token.Annotations, "marmot/m:gender:masc")
Akroncc9a8a62025-06-25 11:56:28 +0200274 assert.True(t, hasGenderMasc, "Token %s should have marmot/m:gender:masc annotation", token.Text)
275 }
276}
277
Akroncc9a8a62025-06-25 11:56:28 +0200278func TestSnippetMatcher_RealWorldExample(t *testing.T) {
279 // Test with the real-world example from the response test
280 pattern := ast.Pattern{
281 Root: &ast.Term{
282 Foundry: "marmot",
283 Layer: "m",
284 Key: "gender",
285 Value: "masc",
286 Match: ast.MatchEqual,
287 },
288 }
289
290 replacement := ast.Replacement{
291 Root: &ast.Term{
292 Foundry: "opennlp",
293 Layer: "m",
294 Key: "M",
295 Value: "",
296 Match: ast.MatchEqual,
297 },
298 }
299
300 sm, err := NewSnippetMatcher(pattern, replacement)
301 require.NoError(t, err)
302
303 // Real snippet from the test file
304 snippet := `<span title="corenlp/p:ART">` +
305 `<span title="marmot/m:case:nom">` +
306 `<span title="marmot/m:gender:masc">` +
307 `<span title="marmot/m:number:sg">` +
308 `<span title="marmot/p:ART">` +
309 `<span title="opennlp/p:ART">` +
310 `<span title="tt/l:die">` +
311 `<span title="tt/p:ART">Der</span>` +
312 `</span>` +
313 `</span>` +
314 `</span>` +
315 `</span>` +
316 `</span>` +
317 `</span>` +
318 `</span>`
319
320 // Parse the snippet
321 tokens, err := sm.ParseSnippet(snippet)
322 require.NoError(t, err)
323 require.Len(t, tokens, 1)
324
325 token := tokens[0]
326 assert.Equal(t, "Der", token.Text)
327
328 // Check that it has all expected annotations
329 expectedAnnotations := []string{
330 "corenlp/p:ART",
331 "marmot/m:case:nom",
332 "marmot/m:gender:masc",
333 "marmot/m:number:sg",
334 "marmot/p:ART",
335 "opennlp/p:ART",
336 "tt/l:die",
337 "tt/p:ART",
338 }
339
340 assert.Len(t, token.Annotations, len(expectedAnnotations))
341 for _, expected := range expectedAnnotations {
342 assert.Contains(t, token.Annotations, expected)
343 }
344
345 // Check that it matches our pattern
346 matches, err := sm.CheckToken(token)
347 require.NoError(t, err)
348 assert.True(t, matches)
349}