Akron | cc9a8a6 | 2025-06-25 11:56:28 +0200 | [diff] [blame^] | 1 | package matcher |
| 2 | |
| 3 | import ( |
| 4 | "testing" |
| 5 | |
| 6 | "github.com/KorAP/KoralPipe-TermMapper/ast" |
| 7 | "github.com/stretchr/testify/assert" |
| 8 | "github.com/stretchr/testify/require" |
| 9 | ) |
| 10 | |
| 11 | func TestSnippetMatcher_ParseSnippet(t *testing.T) { |
| 12 | // Create a pattern for testing |
| 13 | pattern := ast.Pattern{ |
| 14 | Root: &ast.Term{ |
| 15 | Foundry: "marmot", |
| 16 | Layer: "m", |
| 17 | Key: "gender", |
| 18 | Value: "masc", |
| 19 | Match: ast.MatchEqual, |
| 20 | }, |
| 21 | } |
| 22 | |
| 23 | replacement := ast.Replacement{ |
| 24 | Root: &ast.Term{ |
| 25 | Foundry: "opennlp", |
| 26 | Layer: "m", |
| 27 | Key: "M", |
| 28 | Value: "", |
| 29 | Match: ast.MatchEqual, |
| 30 | }, |
| 31 | } |
| 32 | |
| 33 | sm, err := NewSnippetMatcher(pattern, replacement) |
| 34 | require.NoError(t, err) |
| 35 | |
| 36 | tests := []struct { |
| 37 | name string |
| 38 | snippet string |
| 39 | expectedTokens int |
| 40 | expectedContains []string |
| 41 | }{ |
| 42 | { |
| 43 | name: "Simple single token", |
| 44 | snippet: `<span title="corenlp/p:ART"> |
| 45 | <span title="marmot/m:case:nom"> |
| 46 | <span title="marmot/m:gender:masc"> |
| 47 | <span title="marmot/m:number:sg"> |
| 48 | <span title="marmot/p:ART"> |
| 49 | Der</span> |
| 50 | </span> |
| 51 | </span> |
| 52 | </span> |
| 53 | </span>`, |
| 54 | expectedTokens: 1, |
| 55 | expectedContains: []string{"Der"}, |
| 56 | }, |
| 57 | { |
| 58 | name: "Multiple tokens", |
| 59 | snippet: `<span title="corenlp/p:ART"> |
| 60 | <span title="marmot/m:case:nom"> |
| 61 | <span title="marmot/m:gender:masc"> |
| 62 | Der</span> |
| 63 | </span> |
| 64 | </span> |
| 65 | <span title="corenlp/p:ADJA"> |
| 66 | <span title="marmot/m:case:nom"> |
| 67 | <span title="marmot/m:gender:masc"> |
| 68 | alte</span> |
| 69 | </span> |
| 70 | </span>`, |
| 71 | expectedTokens: 2, |
| 72 | expectedContains: []string{"Der", "alte"}, |
| 73 | }, |
| 74 | { |
| 75 | name: "Real-world example from test", |
| 76 | snippet: `<span title="corenlp/p:ART"> |
| 77 | <span title="marmot/m:case:nom"> |
| 78 | <span title="marmot/m:gender:masc"> |
| 79 | <span title="marmot/m:number:sg"> |
| 80 | <span title="marmot/p:ART"> |
| 81 | <span title="opennlp/p:ART"> |
| 82 | <span title="tt/l:die"> |
| 83 | <span title="tt/p:ART">Der</span> |
| 84 | </span> |
| 85 | </span> |
| 86 | </span> |
| 87 | </span> |
| 88 | </span> |
| 89 | </span> |
| 90 | </span>`, |
| 91 | expectedTokens: 1, |
| 92 | expectedContains: []string{"Der"}, |
| 93 | }, |
| 94 | { |
| 95 | name: "Empty snippet", |
| 96 | snippet: "", |
| 97 | expectedTokens: 0, |
| 98 | expectedContains: []string{}, |
| 99 | }, |
| 100 | { |
| 101 | name: "No span elements", |
| 102 | snippet: "Just some text", |
| 103 | expectedTokens: 0, |
| 104 | expectedContains: []string{}, |
| 105 | }, |
| 106 | } |
| 107 | |
| 108 | for _, tt := range tests { |
| 109 | t.Run(tt.name, func(t *testing.T) { |
| 110 | tokens, err := sm.ParseSnippet(tt.snippet) |
| 111 | require.NoError(t, err) |
| 112 | |
| 113 | assert.Len(t, tokens, tt.expectedTokens) |
| 114 | |
| 115 | for i, expectedText := range tt.expectedContains { |
| 116 | if i < len(tokens) { |
| 117 | assert.Equal(t, expectedText, tokens[i].Text) |
| 118 | } |
| 119 | } |
| 120 | }) |
| 121 | } |
| 122 | } |
| 123 | |
| 124 | func TestSnippetMatcher_CheckToken(t *testing.T) { |
| 125 | // Create a pattern that matches tokens with marmot/m:gender=masc |
| 126 | pattern := ast.Pattern{ |
| 127 | Root: &ast.Term{ |
| 128 | Foundry: "marmot", |
| 129 | Layer: "m", |
| 130 | Key: "gender", |
| 131 | Value: "masc", |
| 132 | Match: ast.MatchEqual, |
| 133 | }, |
| 134 | } |
| 135 | |
| 136 | replacement := ast.Replacement{ |
| 137 | Root: &ast.Term{ |
| 138 | Foundry: "opennlp", |
| 139 | Layer: "m", |
| 140 | Key: "M", |
| 141 | Value: "", |
| 142 | Match: ast.MatchEqual, |
| 143 | }, |
| 144 | } |
| 145 | |
| 146 | sm, err := NewSnippetMatcher(pattern, replacement) |
| 147 | require.NoError(t, err) |
| 148 | |
| 149 | tests := []struct { |
| 150 | name string |
| 151 | token TokenSpan |
| 152 | shouldMatch bool |
| 153 | }{ |
| 154 | { |
| 155 | name: "Token with matching annotation", |
| 156 | token: TokenSpan{ |
| 157 | Text: "Der", |
| 158 | Annotations: []string{ |
| 159 | "corenlp/p:ART", |
| 160 | "marmot/m:case:nom", |
| 161 | "marmot/m:gender:masc", |
| 162 | "marmot/m:number:sg", |
| 163 | }, |
| 164 | }, |
| 165 | shouldMatch: true, |
| 166 | }, |
| 167 | { |
| 168 | name: "Token without matching annotation", |
| 169 | token: TokenSpan{ |
| 170 | Text: "und", |
| 171 | Annotations: []string{ |
| 172 | "corenlp/p:KON", |
| 173 | "marmot/p:KON", |
| 174 | "opennlp/p:KON", |
| 175 | }, |
| 176 | }, |
| 177 | shouldMatch: false, |
| 178 | }, |
| 179 | { |
| 180 | name: "Token with no annotations", |
| 181 | token: TokenSpan{ |
| 182 | Text: "text", |
| 183 | Annotations: []string{}, |
| 184 | }, |
| 185 | shouldMatch: false, |
| 186 | }, |
| 187 | { |
| 188 | name: "Token with different gender value", |
| 189 | token: TokenSpan{ |
| 190 | Text: "andere", |
| 191 | Annotations: []string{ |
| 192 | "marmot/m:gender:fem", |
| 193 | "marmot/m:case:nom", |
| 194 | }, |
| 195 | }, |
| 196 | shouldMatch: false, |
| 197 | }, |
| 198 | } |
| 199 | |
| 200 | for _, tt := range tests { |
| 201 | t.Run(tt.name, func(t *testing.T) { |
| 202 | matches, err := sm.CheckToken(tt.token) |
| 203 | require.NoError(t, err) |
| 204 | assert.Equal(t, tt.shouldMatch, matches) |
| 205 | }) |
| 206 | } |
| 207 | } |
| 208 | |
| 209 | func TestSnippetMatcher_FindMatchingTokens(t *testing.T) { |
| 210 | // Create a pattern that matches tokens with marmot/m:gender=masc |
| 211 | pattern := ast.Pattern{ |
| 212 | Root: &ast.Term{ |
| 213 | Foundry: "marmot", |
| 214 | Layer: "m", |
| 215 | Key: "gender", |
| 216 | Value: "masc", |
| 217 | Match: ast.MatchEqual, |
| 218 | }, |
| 219 | } |
| 220 | |
| 221 | replacement := ast.Replacement{ |
| 222 | Root: &ast.Term{ |
| 223 | Foundry: "opennlp", |
| 224 | Layer: "m", |
| 225 | Key: "M", |
| 226 | Value: "", |
| 227 | Match: ast.MatchEqual, |
| 228 | }, |
| 229 | } |
| 230 | |
| 231 | sm, err := NewSnippetMatcher(pattern, replacement) |
| 232 | require.NoError(t, err) |
| 233 | |
| 234 | // Test snippet with mixed tokens - some matching, some not |
| 235 | snippet := `<span title="corenlp/p:ART"> |
| 236 | <span title="marmot/m:case:nom"> |
| 237 | <span title="marmot/m:gender:masc"> |
| 238 | <span title="marmot/m:number:sg"> |
| 239 | Der</span> |
| 240 | </span> |
| 241 | </span> |
| 242 | </span> |
| 243 | <span title="corenlp/p:ADJA"> |
| 244 | <span title="marmot/m:case:nom"> |
| 245 | <span title="marmot/m:gender:masc"> |
| 246 | alte</span> |
| 247 | </span> |
| 248 | </span> |
| 249 | <span title="corenlp/p:NN"> |
| 250 | <span title="marmot/m:case:nom"> |
| 251 | <span title="marmot/m:gender:masc"> |
| 252 | Baum</span> |
| 253 | </span> |
| 254 | </span> |
| 255 | <span title="corenlp/p:KON"> |
| 256 | <span title="marmot/p:KON"> |
| 257 | und</span> |
| 258 | </span>` |
| 259 | |
| 260 | matchingTokens, err := sm.FindMatchingTokens(snippet) |
| 261 | require.NoError(t, err) |
| 262 | |
| 263 | // Should find 3 matching tokens: "Der", "alte", "Baum" (all with gender:masc) |
| 264 | // but not "und" (no gender annotation) |
| 265 | assert.Len(t, matchingTokens, 3) |
| 266 | |
| 267 | expectedTexts := []string{"Der", "alte", "Baum"} |
| 268 | for i, token := range matchingTokens { |
| 269 | assert.Equal(t, expectedTexts[i], token.Text) |
| 270 | |
| 271 | // Verify that each token has the required annotation |
| 272 | hasGenderMasc := false |
| 273 | for _, annotation := range token.Annotations { |
| 274 | if annotation == "marmot/m:gender:masc" { |
| 275 | hasGenderMasc = true |
| 276 | break |
| 277 | } |
| 278 | } |
| 279 | assert.True(t, hasGenderMasc, "Token %s should have marmot/m:gender:masc annotation", token.Text) |
| 280 | } |
| 281 | } |
| 282 | |
| 283 | func TestSnippetMatcher_CheckTokenSequence(t *testing.T) { |
| 284 | // Create a pattern for testing |
| 285 | pattern := ast.Pattern{ |
| 286 | Root: &ast.Term{ |
| 287 | Foundry: "marmot", |
| 288 | Layer: "m", |
| 289 | Key: "gender", |
| 290 | Value: "masc", |
| 291 | Match: ast.MatchEqual, |
| 292 | }, |
| 293 | } |
| 294 | |
| 295 | replacement := ast.Replacement{ |
| 296 | Root: &ast.Term{ |
| 297 | Foundry: "opennlp", |
| 298 | Layer: "m", |
| 299 | Key: "M", |
| 300 | Value: "", |
| 301 | Match: ast.MatchEqual, |
| 302 | }, |
| 303 | } |
| 304 | |
| 305 | sm, err := NewSnippetMatcher(pattern, replacement) |
| 306 | require.NoError(t, err) |
| 307 | |
| 308 | tests := []struct { |
| 309 | name string |
| 310 | tokens []TokenSpan |
| 311 | shouldMatch bool |
| 312 | }{ |
| 313 | { |
| 314 | name: "Sequence with matching token", |
| 315 | tokens: []TokenSpan{ |
| 316 | { |
| 317 | Text: "Der", |
| 318 | Annotations: []string{ |
| 319 | "marmot/m:gender:masc", |
| 320 | "marmot/m:case:nom", |
| 321 | }, |
| 322 | }, |
| 323 | { |
| 324 | Text: "alte", |
| 325 | Annotations: []string{ |
| 326 | "marmot/m:gender:fem", |
| 327 | "marmot/m:case:nom", |
| 328 | }, |
| 329 | }, |
| 330 | }, |
| 331 | shouldMatch: true, // First token matches |
| 332 | }, |
| 333 | { |
| 334 | name: "Sequence with no matching tokens", |
| 335 | tokens: []TokenSpan{ |
| 336 | { |
| 337 | Text: "und", |
| 338 | Annotations: []string{ |
| 339 | "marmot/p:KON", |
| 340 | }, |
| 341 | }, |
| 342 | { |
| 343 | Text: "oder", |
| 344 | Annotations: []string{ |
| 345 | "marmot/p:KON", |
| 346 | }, |
| 347 | }, |
| 348 | }, |
| 349 | shouldMatch: false, |
| 350 | }, |
| 351 | { |
| 352 | name: "Empty sequence", |
| 353 | tokens: []TokenSpan{}, |
| 354 | shouldMatch: false, |
| 355 | }, |
| 356 | } |
| 357 | |
| 358 | for _, tt := range tests { |
| 359 | t.Run(tt.name, func(t *testing.T) { |
| 360 | matches, err := sm.CheckTokenSequence(tt.tokens) |
| 361 | require.NoError(t, err) |
| 362 | assert.Equal(t, tt.shouldMatch, matches) |
| 363 | }) |
| 364 | } |
| 365 | } |
| 366 | |
| 367 | func TestSnippetMatcher_GetReplacement(t *testing.T) { |
| 368 | pattern := ast.Pattern{ |
| 369 | Root: &ast.Term{ |
| 370 | Foundry: "marmot", |
| 371 | Layer: "m", |
| 372 | Key: "gender", |
| 373 | Value: "masc", |
| 374 | Match: ast.MatchEqual, |
| 375 | }, |
| 376 | } |
| 377 | |
| 378 | replacement := ast.Replacement{ |
| 379 | Root: &ast.Term{ |
| 380 | Foundry: "opennlp", |
| 381 | Layer: "m", |
| 382 | Key: "M", |
| 383 | Value: "", |
| 384 | Match: ast.MatchEqual, |
| 385 | }, |
| 386 | } |
| 387 | |
| 388 | sm, err := NewSnippetMatcher(pattern, replacement) |
| 389 | require.NoError(t, err) |
| 390 | |
| 391 | replacementNode := sm.GetReplacement() |
| 392 | require.NotNil(t, replacementNode) |
| 393 | |
| 394 | term, ok := replacementNode.(*ast.Term) |
| 395 | require.True(t, ok) |
| 396 | assert.Equal(t, "opennlp", term.Foundry) |
| 397 | assert.Equal(t, "m", term.Layer) |
| 398 | assert.Equal(t, "M", term.Key) |
| 399 | } |
| 400 | |
| 401 | func TestSnippetMatcher_RealWorldExample(t *testing.T) { |
| 402 | // Test with the real-world example from the response test |
| 403 | pattern := ast.Pattern{ |
| 404 | Root: &ast.Term{ |
| 405 | Foundry: "marmot", |
| 406 | Layer: "m", |
| 407 | Key: "gender", |
| 408 | Value: "masc", |
| 409 | Match: ast.MatchEqual, |
| 410 | }, |
| 411 | } |
| 412 | |
| 413 | replacement := ast.Replacement{ |
| 414 | Root: &ast.Term{ |
| 415 | Foundry: "opennlp", |
| 416 | Layer: "m", |
| 417 | Key: "M", |
| 418 | Value: "", |
| 419 | Match: ast.MatchEqual, |
| 420 | }, |
| 421 | } |
| 422 | |
| 423 | sm, err := NewSnippetMatcher(pattern, replacement) |
| 424 | require.NoError(t, err) |
| 425 | |
| 426 | // Real snippet from the test file |
| 427 | snippet := `<span title="corenlp/p:ART">` + |
| 428 | `<span title="marmot/m:case:nom">` + |
| 429 | `<span title="marmot/m:gender:masc">` + |
| 430 | `<span title="marmot/m:number:sg">` + |
| 431 | `<span title="marmot/p:ART">` + |
| 432 | `<span title="opennlp/p:ART">` + |
| 433 | `<span title="tt/l:die">` + |
| 434 | `<span title="tt/p:ART">Der</span>` + |
| 435 | `</span>` + |
| 436 | `</span>` + |
| 437 | `</span>` + |
| 438 | `</span>` + |
| 439 | `</span>` + |
| 440 | `</span>` + |
| 441 | `</span>` |
| 442 | |
| 443 | // Parse the snippet |
| 444 | tokens, err := sm.ParseSnippet(snippet) |
| 445 | require.NoError(t, err) |
| 446 | require.Len(t, tokens, 1) |
| 447 | |
| 448 | token := tokens[0] |
| 449 | assert.Equal(t, "Der", token.Text) |
| 450 | |
| 451 | // Check that it has all expected annotations |
| 452 | expectedAnnotations := []string{ |
| 453 | "corenlp/p:ART", |
| 454 | "marmot/m:case:nom", |
| 455 | "marmot/m:gender:masc", |
| 456 | "marmot/m:number:sg", |
| 457 | "marmot/p:ART", |
| 458 | "opennlp/p:ART", |
| 459 | "tt/l:die", |
| 460 | "tt/p:ART", |
| 461 | } |
| 462 | |
| 463 | assert.Len(t, token.Annotations, len(expectedAnnotations)) |
| 464 | for _, expected := range expectedAnnotations { |
| 465 | assert.Contains(t, token.Annotations, expected) |
| 466 | } |
| 467 | |
| 468 | // Check that it matches our pattern |
| 469 | matches, err := sm.CheckToken(token) |
| 470 | require.NoError(t, err) |
| 471 | assert.True(t, matches) |
| 472 | } |