# Degenerate inputs (NULL, NA, empty string) must each yield a list with the
# four annotation fields, every one of them zero-length.
test_that("parse_xml_annotations returns empty vectors for empty input", {
  from_null <- RKorAPClient:::parse_xml_annotations(NULL)
  from_na <- RKorAPClient:::parse_xml_annotations(NA)
  from_blank <- RKorAPClient:::parse_xml_annotations("")

  results <- list(from_null, from_na, from_blank)
  for (i in seq_along(results)) {
    ann <- results[[i]]

    # Shape: a list carrying exactly these names, in this order.
    expect_true(is.list(ann))
    expect_named(ann, c("token", "lemma", "pos", "morph"))

    # Content: nothing was parsed, so every field is empty.
    expect_length(ann$token, 0)
    expect_length(ann$lemma, 0)
    expect_length(ann$pos, 0)
    expect_length(ann$morph, 0)
  }
})
| 15 | |
test_that("parse_xml_annotations extracts tokens/pos/lemma across multiple <mark> blocks", {
  # Fixture: a match span whose tokens alternate between plain spans and
  # <mark> highlights; the second <mark> wraps two tokens.
  snippet <- '<span class="context-left"></span>
<span class="match">
<span title="tt/l:Wir"><span title="tt/p:PPER">Wir</span></span>
<mark>
<span title="tt/l:können"><span title="tt/p:VVFIN">können</span></span>
</mark>
<span title="tt/l:alles"><span title="tt/p:PIS">alles</span></span>
<mark>
<span title="tt/l:außer"><span title="tt/p:APPR">außer</span></span>
<span title="tt/l:Plan"><span title="tt/p:NN">Plan</span></span>
</mark>
</span>
<span class="context-right"></span>'

  ann <- RKorAPClient:::parse_xml_annotations(snippet)

  # Tokens inside and outside <mark> must come back in document order.
  surface_forms <- c("Wir", "können", "alles", "außer", "Plan")
  expect_equal(ann$token, surface_forms)
  expect_equal(ann$pos, c("PPER", "VVFIN", "PIS", "APPR", "NN"))
  # In this fixture every lemma coincides with its surface form.
  expect_equal(ann$lemma, surface_forms)

  # The fixture carries no morphological features: expect one NA per token.
  morph <- ann$morph
  expect_length(morph, length(ann$token))
  expect_true(all(is.na(morph)))
})
| 41 | |
test_that("parse_xml_annotations handles missing lemma/pos/morph gracefully", {
  # Fixture with three tokens of differing annotation coverage:
  #   1. "Haus"   - POS only
  #   2. "können" - lemma, POS and morph (inside <mark>)
  #   3. "gehen"  - lemma only
  snippet <- '<span class="match">
<span title="tt/p:NN">Haus</span>
<mark><span title="tt/l:können tt/p:VVFIN marmot/m:verbform:fin">können</span></mark>
<span title="tt/l:gehen">gehen</span>
</span>'

  ann <- RKorAPClient:::parse_xml_annotations(snippet)

  # Absent annotations are expected as NA in the token's position.
  expect_equal(ann$token, c("Haus", "können", "gehen"))
  expect_equal(ann$pos, c("NN", "VVFIN", NA_character_))
  expect_equal(ann$lemma, c(NA_character_, "können", "gehen"))
  expect_equal(ann$morph, c(NA_character_, "verbform:fin", NA_character_))

  # All annotation vectors stay aligned with the token vector.
  token_count <- length(ann$token)
  expect_length(ann$lemma, token_count)
  expect_length(ann$pos, token_count)
  expect_length(ann$morph, token_count)
})
| 63 | |