# Source blob: 4f65d573d6edb6ad67c3ecae8686592bc4aac8cf
# Empty-like inputs (NULL, NA and the empty string) must all produce a list
# with the four annotation layers present and each layer of length zero.
test_that("parse_xml_annotations returns empty vectors for empty input", {
  res1 <- RKorAPClient:::parse_xml_annotations(NULL)
  res2 <- RKorAPClient:::parse_xml_annotations(NA)
  res3 <- RKorAPClient:::parse_xml_annotations("")

  # Every empty-like input is checked against the same result shape.
  for (res in list(res1, res2, res3)) {
    expect_true(is.list(res))
    expect_named(res, c("token", "lemma", "pos", "morph"))
    expect_length(res$token, 0)
    expect_length(res$lemma, 0)
    expect_length(res$pos, 0)
    expect_length(res$morph, 0)
  }
})
15
# Tokens inside and outside <mark> elements must be returned in document
# order, each with its lemma (tt/l:) and POS (tt/p:) taken from the span titles.
test_that("parse_xml_annotations extracts tokens/pos/lemma across multiple <mark> blocks", {
  xml_snippet <- '<span class="context-left"></span>
    <span class="match">
      <span title="tt/l:Wir"><span title="tt/p:PPER">Wir</span></span>
      <mark>
        <span title="tt/l:können"><span title="tt/p:VVFIN">können</span></span>
      </mark>
      <span title="tt/l:alles"><span title="tt/p:PIS">alles</span></span>
      <mark>
        <span title="tt/l:außer"><span title="tt/p:APPR">außer</span></span>
        <span title="tt/l:Plan"><span title="tt/p:NN">Plan</span></span>
      </mark>
    </span>
    <span class="context-right"></span>'

  parsed <- RKorAPClient:::parse_xml_annotations(xml_snippet)

  expect_equal(parsed$token, c("Wir", "können", "alles", "außer", "Plan"))
  expect_equal(parsed$pos, c("PPER", "VVFIN", "PIS", "APPR", "NN"))
  expect_equal(parsed$lemma, c("Wir", "können", "alles", "außer", "Plan"))

  # morph not present in snippet; should be NA-aligned to tokens
  expect_length(parsed$morph, length(parsed$token))
  expect_true(all(is.na(parsed$morph)))
})
41
# Tokens that lack one or more annotation layers must yield NA in the
# corresponding position so that all result vectors stay aligned with `token`.
test_that("parse_xml_annotations handles missing lemma/pos/morph gracefully", {
  # First token has POS only; second has lemma+POS+morph; third has lemma only
  xml_snippet <- '<span class="match">
    <span title="tt/p:NN">Haus</span>
    <mark><span title="tt/l:können tt/p:VVFIN marmot/m:verbform:fin">können</span></mark>
    <span title="tt/l:gehen">gehen</span>
  </span>'

  parsed <- RKorAPClient:::parse_xml_annotations(xml_snippet)

  expect_equal(parsed$token, c("Haus", "können", "gehen"))
  expect_equal(parsed$pos, c("NN", "VVFIN", NA))
  expect_equal(parsed$lemma, c(NA, "können", "gehen"))
  expect_equal(parsed$morph, c(NA, "verbform:fin", NA))

  # Vectors must be equal length
  n <- length(parsed$token)
  expect_length(parsed$lemma, n)
  expect_length(parsed$pos, n)
  expect_length(parsed$morph, n)
})
63
# Morphological features spread over several nested wrapper spans around one
# token must all be kept, by both the basic and the structured parser.
test_that("parsers retain all morphological features from nested spans", {
  xml_snippet <- '<span class="context-left"></span>
    <span class="match">
      <mark>
        <span title="marmot/m:number:sg">
          <span title="marmot/m:case:* marmot/m:case:fem">
            <span title="tt/l:Ameisenplage tt/p:NN">Ameisenplage</span>
          </span>
        </span>
      </mark>
    </span>
    <span class="context-right"></span>'

  basic <- RKorAPClient:::parse_xml_annotations(xml_snippet)
  structured <- RKorAPClient:::parse_xml_annotations_structured(xml_snippet)

  expect_equal(basic$token, "Ameisenplage")
  expect_equal(structured$atokens$match, "Ameisenplage")

  # Features are compared as sets after splitting on "|", so their order
  # within the joined string does not matter.
  basic_feats <- unlist(strsplit(basic$morph, "\\|"))
  structured_feats <- unlist(strsplit(structured$morph$match, "\\|"))

  expect_setequal(basic_feats, c("case:*", "case:fem", "number:sg"))
  expect_setequal(structured_feats, c("case:*", "case:fem", "number:sg"))
})
89
# When a token carries several lemma or POS annotations, both parsers must
# keep all of them, joined with "|" in the order they appear in the title.
test_that("multiple lemma and POS values are preserved", {
  xml_snippet <- '<span class="match">
    <mark><span title="tt/l:gehen tt/l:geh tt/p:VVFIN tt/p:VVINF">gehen</span></mark>
  </span>'

  basic <- RKorAPClient:::parse_xml_annotations(xml_snippet)
  structured <- RKorAPClient:::parse_xml_annotations_structured(xml_snippet)

  expect_equal(basic$lemma, "gehen|geh")
  expect_equal(basic$pos, "VVFIN|VVINF")
  expect_equal(structured$lemma$match, "gehen|geh")
  expect_equal(structured$pos$match, "VVFIN|VVINF")
})