test_that("parse_xml_annotations returns empty vectors for empty input", {
  # NULL, NA and the empty string are all treated as "nothing to parse";
  # each must yield the same four-field list of zero-length vectors.
  empty_inputs <- list(NULL, NA, "")
  field_names <- c("token", "lemma", "pos", "morph")

  parsed_results <- lapply(empty_inputs, RKorAPClient:::parse_xml_annotations)

  for (parsed in parsed_results) {
    expect_true(is.list(parsed))
    expect_named(parsed, field_names)
    for (field in field_names) {
      expect_length(parsed[[field]], 0)
    }
  }
})
| 15 | |
test_that("parse_xml_annotations extracts tokens/pos/lemma across multiple <mark> blocks", {
  # Five tokens in the match span: two outside <mark>, one in a first <mark>
  # element and two in a second one. Lemma (tt/l) and POS (tt/p) are carried
  # by nested title attributes; no morphological annotation is present.
  xml_snippet <- '<span class="context-left"></span>
  <span class="match">
    <span title="tt/l:Wir"><span title="tt/p:PPER">Wir</span></span>
    <mark>
      <span title="tt/l:können"><span title="tt/p:VVFIN">können</span></span>
    </mark>
    <span title="tt/l:alles"><span title="tt/p:PIS">alles</span></span>
    <mark>
      <span title="tt/l:außer"><span title="tt/p:APPR">außer</span></span>
      <span title="tt/l:Plan"><span title="tt/p:NN">Plan</span></span>
    </mark>
  </span>
  <span class="context-right"></span>'

  annotations <- RKorAPClient:::parse_xml_annotations(xml_snippet)

  # In this snippet every lemma is spelled exactly like its surface token.
  expected_words <- c("Wir", "können", "alles", "außer", "Plan")
  expected_pos <- c("PPER", "VVFIN", "PIS", "APPR", "NN")

  expect_equal(annotations$token, expected_words)
  expect_equal(annotations$pos, expected_pos)
  expect_equal(annotations$lemma, expected_words)

  # The morph vector must still line up with the tokens, filled with NA.
  n_tokens <- length(annotations$token)
  expect_length(annotations$morph, n_tokens)
  expect_true(all(is.na(annotations$morph)))
})
| 41 | |
test_that("parse_xml_annotations handles missing lemma/pos/morph gracefully", {
  # Token 1 carries only a POS tag, token 2 carries lemma + POS + morph,
  # token 3 carries only a lemma. Missing layers are expected to come back
  # as NA.
  xml_snippet <- '<span class="match">
    <span title="tt/p:NN">Haus</span>
    <mark><span title="tt/l:können tt/p:VVFIN marmot/m:verbform:fin">können</span></mark>
    <span title="tt/l:gehen">gehen</span>
  </span>'

  annotations <- RKorAPClient:::parse_xml_annotations(xml_snippet)

  expect_equal(annotations$token, c("Haus", "können", "gehen"))
  expect_equal(annotations$pos, c("NN", "VVFIN", NA))
  expect_equal(annotations$lemma, c(NA, "können", "gehen"))
  expect_equal(annotations$morph, c(NA, "verbform:fin", NA))

  # All annotation layers have to stay aligned with the token vector.
  n_tokens <- length(annotations$token)
  for (layer in list(annotations$lemma, annotations$pos, annotations$morph)) {
    expect_length(layer, n_tokens)
  }
})
| 63 | |
test_that("parsers retain all morphological features from nested spans", {
  # A single token wrapped in two outer spans that each contribute
  # marmot/m features; the innermost span holds lemma and POS.
  xml_snippet <- '<span class="context-left"></span>
  <span class="match">
    <mark>
      <span title="marmot/m:number:sg">
        <span title="marmot/m:case:* marmot/m:case:fem">
          <span title="tt/l:Ameisenplage tt/p:NN">Ameisenplage</span>
        </span>
      </span>
    </mark>
  </span>
  <span class="context-right"></span>'

  # Break a "|"-joined feature string into its individual features.
  split_features <- function(joined) {
    unlist(strsplit(joined, "\\|"))
  }

  expected_features <- c("case:*", "case:fem", "number:sg")

  flat <- RKorAPClient:::parse_xml_annotations(xml_snippet)
  by_section <- RKorAPClient:::parse_xml_annotations_structured(xml_snippet)

  expect_equal(flat$token, "Ameisenplage")
  expect_equal(by_section$atokens$match, "Ameisenplage")

  flat_features <- split_features(flat$morph)
  section_features <- split_features(by_section$morph$match)

  # Only the set of features matters here, not the order they are joined in.
  expect_setequal(flat_features, expected_features)
  expect_setequal(section_features, expected_features)
})
| 89 | |
test_that("multiple lemma and POS values are preserved", {
  # One token annotated with two lemma readings and two POS readings;
  # both parsers are expected to join the alternatives with "|".
  xml_snippet <- '<span class="match">
    <mark><span title="tt/l:gehen tt/l:geh tt/p:VVFIN tt/p:VVINF">gehen</span></mark>
  </span>'

  expected_lemma <- "gehen|geh"
  expected_pos <- "VVFIN|VVINF"

  flat <- RKorAPClient:::parse_xml_annotations(xml_snippet)
  by_section <- RKorAPClient:::parse_xml_annotations_structured(xml_snippet)

  expect_equal(flat$lemma, expected_lemma)
  expect_equal(flat$pos, expected_pos)
  expect_equal(by_section$lemma$match, expected_lemma)
  expect_equal(by_section$pos$match, expected_pos)
})