blob: 5c9b852144cf231fce2c5b5753a5213d5d521359 [file] [log] [blame]
Akroncc9a8a62025-06-25 11:56:28 +02001package matcher
2
3import (
4 "testing"
5
6 "github.com/KorAP/KoralPipe-TermMapper/ast"
7 "github.com/stretchr/testify/assert"
8 "github.com/stretchr/testify/require"
9)
10
11func TestSnippetMatcher_ParseSnippet(t *testing.T) {
12 // Create a pattern for testing
13 pattern := ast.Pattern{
14 Root: &ast.Term{
15 Foundry: "marmot",
16 Layer: "m",
17 Key: "gender",
18 Value: "masc",
19 Match: ast.MatchEqual,
20 },
21 }
22
23 replacement := ast.Replacement{
24 Root: &ast.Term{
25 Foundry: "opennlp",
26 Layer: "m",
27 Key: "M",
28 Value: "",
29 Match: ast.MatchEqual,
30 },
31 }
32
33 sm, err := NewSnippetMatcher(pattern, replacement)
34 require.NoError(t, err)
35
36 tests := []struct {
37 name string
38 snippet string
39 expectedTokens int
40 expectedContains []string
41 }{
42 {
43 name: "Simple single token",
44 snippet: `<span title="corenlp/p:ART">
45 <span title="marmot/m:case:nom">
46 <span title="marmot/m:gender:masc">
47 <span title="marmot/m:number:sg">
48 <span title="marmot/p:ART">
49 Der</span>
50 </span>
51 </span>
52 </span>
53 </span>`,
54 expectedTokens: 1,
55 expectedContains: []string{"Der"},
56 },
57 {
58 name: "Multiple tokens",
59 snippet: `<span title="corenlp/p:ART">
60 <span title="marmot/m:case:nom">
61 <span title="marmot/m:gender:masc">
62 Der</span>
63 </span>
64 </span>
65 <span title="corenlp/p:ADJA">
66 <span title="marmot/m:case:nom">
67 <span title="marmot/m:gender:masc">
68 alte</span>
69 </span>
70 </span>`,
71 expectedTokens: 2,
72 expectedContains: []string{"Der", "alte"},
73 },
74 {
75 name: "Real-world example from test",
76 snippet: `<span title="corenlp/p:ART">
77 <span title="marmot/m:case:nom">
78 <span title="marmot/m:gender:masc">
79 <span title="marmot/m:number:sg">
80 <span title="marmot/p:ART">
81 <span title="opennlp/p:ART">
82 <span title="tt/l:die">
83 <span title="tt/p:ART">Der</span>
84 </span>
85 </span>
86 </span>
87 </span>
88 </span>
89 </span>
90 </span>`,
91 expectedTokens: 1,
92 expectedContains: []string{"Der"},
93 },
94 {
95 name: "Empty snippet",
96 snippet: "",
97 expectedTokens: 0,
98 expectedContains: []string{},
99 },
100 {
101 name: "No span elements",
102 snippet: "Just some text",
103 expectedTokens: 0,
104 expectedContains: []string{},
105 },
106 }
107
108 for _, tt := range tests {
109 t.Run(tt.name, func(t *testing.T) {
110 tokens, err := sm.ParseSnippet(tt.snippet)
111 require.NoError(t, err)
112
113 assert.Len(t, tokens, tt.expectedTokens)
114
115 for i, expectedText := range tt.expectedContains {
116 if i < len(tokens) {
117 assert.Equal(t, expectedText, tokens[i].Text)
118 }
119 }
120 })
121 }
122}
123
124func TestSnippetMatcher_CheckToken(t *testing.T) {
125 // Create a pattern that matches tokens with marmot/m:gender=masc
126 pattern := ast.Pattern{
127 Root: &ast.Term{
128 Foundry: "marmot",
129 Layer: "m",
130 Key: "gender",
131 Value: "masc",
132 Match: ast.MatchEqual,
133 },
134 }
135
136 replacement := ast.Replacement{
137 Root: &ast.Term{
138 Foundry: "opennlp",
139 Layer: "m",
140 Key: "M",
141 Value: "",
142 Match: ast.MatchEqual,
143 },
144 }
145
146 sm, err := NewSnippetMatcher(pattern, replacement)
147 require.NoError(t, err)
148
149 tests := []struct {
150 name string
151 token TokenSpan
152 shouldMatch bool
153 }{
154 {
155 name: "Token with matching annotation",
156 token: TokenSpan{
157 Text: "Der",
158 Annotations: []string{
159 "corenlp/p:ART",
160 "marmot/m:case:nom",
161 "marmot/m:gender:masc",
162 "marmot/m:number:sg",
163 },
164 },
165 shouldMatch: true,
166 },
167 {
168 name: "Token without matching annotation",
169 token: TokenSpan{
170 Text: "und",
171 Annotations: []string{
172 "corenlp/p:KON",
173 "marmot/p:KON",
174 "opennlp/p:KON",
175 },
176 },
177 shouldMatch: false,
178 },
179 {
180 name: "Token with no annotations",
181 token: TokenSpan{
182 Text: "text",
183 Annotations: []string{},
184 },
185 shouldMatch: false,
186 },
187 {
188 name: "Token with different gender value",
189 token: TokenSpan{
190 Text: "andere",
191 Annotations: []string{
192 "marmot/m:gender:fem",
193 "marmot/m:case:nom",
194 },
195 },
196 shouldMatch: false,
197 },
198 }
199
200 for _, tt := range tests {
201 t.Run(tt.name, func(t *testing.T) {
202 matches, err := sm.CheckToken(tt.token)
203 require.NoError(t, err)
204 assert.Equal(t, tt.shouldMatch, matches)
205 })
206 }
207}
208
209func TestSnippetMatcher_FindMatchingTokens(t *testing.T) {
210 // Create a pattern that matches tokens with marmot/m:gender=masc
211 pattern := ast.Pattern{
212 Root: &ast.Term{
213 Foundry: "marmot",
214 Layer: "m",
215 Key: "gender",
216 Value: "masc",
217 Match: ast.MatchEqual,
218 },
219 }
220
221 replacement := ast.Replacement{
222 Root: &ast.Term{
223 Foundry: "opennlp",
224 Layer: "m",
225 Key: "M",
226 Value: "",
227 Match: ast.MatchEqual,
228 },
229 }
230
231 sm, err := NewSnippetMatcher(pattern, replacement)
232 require.NoError(t, err)
233
234 // Test snippet with mixed tokens - some matching, some not
235 snippet := `<span title="corenlp/p:ART">
236 <span title="marmot/m:case:nom">
237 <span title="marmot/m:gender:masc">
238 <span title="marmot/m:number:sg">
239 Der</span>
240 </span>
241 </span>
242 </span>
243 <span title="corenlp/p:ADJA">
244 <span title="marmot/m:case:nom">
245 <span title="marmot/m:gender:masc">
246 alte</span>
247 </span>
248 </span>
249 <span title="corenlp/p:NN">
250 <span title="marmot/m:case:nom">
251 <span title="marmot/m:gender:masc">
252 Baum</span>
253 </span>
254 </span>
255 <span title="corenlp/p:KON">
256 <span title="marmot/p:KON">
257 und</span>
258 </span>`
259
260 matchingTokens, err := sm.FindMatchingTokens(snippet)
261 require.NoError(t, err)
262
263 // Should find 3 matching tokens: "Der", "alte", "Baum" (all with gender:masc)
264 // but not "und" (no gender annotation)
265 assert.Len(t, matchingTokens, 3)
266
267 expectedTexts := []string{"Der", "alte", "Baum"}
268 for i, token := range matchingTokens {
269 assert.Equal(t, expectedTexts[i], token.Text)
270
271 // Verify that each token has the required annotation
272 hasGenderMasc := false
273 for _, annotation := range token.Annotations {
274 if annotation == "marmot/m:gender:masc" {
275 hasGenderMasc = true
276 break
277 }
278 }
279 assert.True(t, hasGenderMasc, "Token %s should have marmot/m:gender:masc annotation", token.Text)
280 }
281}
282
Akroncc9a8a62025-06-25 11:56:28 +0200283func TestSnippetMatcher_RealWorldExample(t *testing.T) {
284 // Test with the real-world example from the response test
285 pattern := ast.Pattern{
286 Root: &ast.Term{
287 Foundry: "marmot",
288 Layer: "m",
289 Key: "gender",
290 Value: "masc",
291 Match: ast.MatchEqual,
292 },
293 }
294
295 replacement := ast.Replacement{
296 Root: &ast.Term{
297 Foundry: "opennlp",
298 Layer: "m",
299 Key: "M",
300 Value: "",
301 Match: ast.MatchEqual,
302 },
303 }
304
305 sm, err := NewSnippetMatcher(pattern, replacement)
306 require.NoError(t, err)
307
308 // Real snippet from the test file
309 snippet := `<span title="corenlp/p:ART">` +
310 `<span title="marmot/m:case:nom">` +
311 `<span title="marmot/m:gender:masc">` +
312 `<span title="marmot/m:number:sg">` +
313 `<span title="marmot/p:ART">` +
314 `<span title="opennlp/p:ART">` +
315 `<span title="tt/l:die">` +
316 `<span title="tt/p:ART">Der</span>` +
317 `</span>` +
318 `</span>` +
319 `</span>` +
320 `</span>` +
321 `</span>` +
322 `</span>` +
323 `</span>`
324
325 // Parse the snippet
326 tokens, err := sm.ParseSnippet(snippet)
327 require.NoError(t, err)
328 require.Len(t, tokens, 1)
329
330 token := tokens[0]
331 assert.Equal(t, "Der", token.Text)
332
333 // Check that it has all expected annotations
334 expectedAnnotations := []string{
335 "corenlp/p:ART",
336 "marmot/m:case:nom",
337 "marmot/m:gender:masc",
338 "marmot/m:number:sg",
339 "marmot/p:ART",
340 "opennlp/p:ART",
341 "tt/l:die",
342 "tt/p:ART",
343 }
344
345 assert.Len(t, token.Annotations, len(expectedAnnotations))
346 for _, expected := range expectedAnnotations {
347 assert.Contains(t, token.Annotations, expected)
348 }
349
350 // Check that it matches our pattern
351 matches, err := sm.CheckToken(token)
352 require.NoError(t, err)
353 assert.True(t, matches)
354}