CA: replace NA dampening factor with missingScoreQuantile
Use distributional priors instead of raw minima when imputing missing scores.
Change-Id: I85d77a63c1bc06c8f6398ddac75deec52f1ca96a
diff --git a/tests/testthat/test-collocations.R b/tests/testthat/test-collocations.R
index 52a2a66..0705a2b 100644
--- a/tests/testthat/test-collocations.R
+++ b/tests/testthat/test-collocations.R
@@ -121,7 +121,7 @@
pmi = c(2, 3)
)
- enriched <- RKorAPClient:::add_multi_vc_comparisons(sample_result, "logDice", 0.9)
+ enriched <- RKorAPClient:::add_multi_vc_comparisons(sample_result, "logDice")
expect_true(all(c(
"winner_logDice",
@@ -175,7 +175,7 @@
pmi = c(2, 3, 1)
)
- enriched <- RKorAPClient:::add_multi_vc_comparisons(sample_result, "logDice", 0.9)
+ enriched <- RKorAPClient:::add_multi_vc_comparisons(sample_result, "logDice")
expect_equal(enriched$winner_logDice[1], "B")
expect_equal(enriched$winner_logDice_value[1], 8)
expect_equal(enriched$runner_up_logDice[1], "A")
@@ -216,7 +216,7 @@
)
)
- enriched <- RKorAPClient:::add_multi_vc_comparisons(base_tbl, "logDice", 0.9)
+ enriched <- RKorAPClient:::add_multi_vc_comparisons(base_tbl, "logDice")
target_row <- enriched |>
dplyr::filter(collocate == "c1") |>
dplyr::slice_head(n = 1)
@@ -251,7 +251,7 @@
logDice = c(5, NA)
)
- enriched <- RKorAPClient:::add_multi_vc_comparisons(sample_result, "logDice", 0.9)
+ enriched <- RKorAPClient:::add_multi_vc_comparisons(sample_result, "logDice")
expect_equal(enriched$rank_A_logDice[1], 1)
expect_true(is.na(enriched$rank_B_logDice[1]))
@@ -261,6 +261,46 @@
expect_equal(enriched$max_delta_rank_logDice[1], 1)
})
+test_that("adaptive missing score imputation respects measure-specific scales", {
+ sample_result <- tibble::tibble(
+ node = c("n", "n", "n"),
+ collocate = c("c", "c", "c"),
+ vc = c("vc1", "vc2", "vc3"),
+ label = c("A", "B", "C"),
+ N = c(100, 100, 100),
+ O = c(12, 9, 7),
+ O1 = c(60, 40, 30),
+ O2 = c(33, 22, 18),
+ E = c(6, 6, 6),
+ w = c(2, 2, 2),
+ leftContextSize = c(1, 1, 1),
+ rightContextSize = c(1, 1, 1),
+ frequency = c(15, 11, 9),
+ logDice = c(-0.31, NA, -0.12),
+ pmi = c(-1.65, NA, -0.48),
+ ll = c(12.4, NA, 7.9)
+ )
+
+ enriched <- RKorAPClient:::add_multi_vc_comparisons(
+ sample_result,
+ "logDice",
+ missingScoreQuantile = 0.05
+ )
+
+ row_a <- dplyr::filter(enriched, label == "A") |> dplyr::slice_head(n = 1)
+
+ expect_false(is.na(row_a$logDice_B))
+ expect_false(is.na(row_a$pmi_B))
+ expect_false(is.na(row_a$ll_B))
+
+ expect_lt(row_a$logDice_B, min(sample_result$logDice, na.rm = TRUE))
+ expect_lt(row_a$pmi_B, min(sample_result$pmi, na.rm = TRUE))
+ expect_lte(row_a$ll_B, min(sample_result$ll, na.rm = TRUE))
+
+ expect_gt(row_a$max_delta_logDice, 0)
+ expect_gt(row_a$winner_logDice_value - row_a$loser_logDice_value, 0)
+})
+
# New tests for improved coverage of collocationAnalysis.R helper functions
test_that("synsemanticStopwords returns German stopwords", {