CA: replace NA dampening factor with missingScoreQuantile
Better use distributional priors instead of raw minima.
Change-Id: I85d77a63c1bc06c8f6398ddac75deec52f1ca96a
diff --git a/R/collocationAnalysis.R b/R/collocationAnalysis.R
index 9446a4b..67a25cb 100644
--- a/R/collocationAnalysis.R
+++ b/R/collocationAnalysis.R
@@ -42,7 +42,7 @@
#' @param threshold minimum value of `thresholdScore` function call to apply collocation analysis recursively
#' @param localStopwords vector of stopwords that will not be considered as collocates in the current function call, but that will not be passed to recursive calls
#' @param collocateFilterRegex allow only collocates matching the regular expression
-#' @param multiVcMissingScoreFactor factor that is multiplied with the minimum observed score when imputing missing scores for delta computations between virtual corpora
+#' @param missingScoreQuantile lower quantile (evaluated per association measure) that anchors the adaptive floor used for imputing missing scores between virtual corpora
#' @param vcLabel optional label override for the current virtual corpus (used internally when named VC collections are expanded)
#' @param ... more arguments will be passed to [collocationScoreQuery()]
#' @inheritParams collocationScoreQuery,KorAPConnection-method
@@ -99,7 +99,7 @@
threshold = 2.0,
localStopwords = c(),
collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
- multiVcMissingScoreFactor = 0.9,
+ missingScoreQuantile = 0.05,
vcLabel = NA_character_,
...) {
word <- frequency <- O <- NULL
@@ -154,7 +154,7 @@
localStopwords = localStopwords,
seed = seed,
expand = expand,
- multiVcMissingScoreFactor = multiVcMissingScoreFactor,
+ missingScoreQuantile = missingScoreQuantile,
collocateFilterRegex = collocateFilterRegex,
vcLabel = vcLabel,
...
@@ -183,7 +183,10 @@
}
multi_result |>
- add_multi_vc_comparisons(thresholdScore = thresholdScore, missingScoreFactor = multiVcMissingScoreFactor)
+ add_multi_vc_comparisons(
+ thresholdScore = thresholdScore,
+ missingScoreQuantile = missingScoreQuantile
+ )
}
} else {
if ((is.na(vcLabel) || vcLabel == "") && length(vcNames) >= 1) {
@@ -259,7 +262,7 @@
searchHitsSampleLimit = searchHitsSampleLimit,
topCollocatesLimit = topCollocatesLimit,
addExamples = FALSE,
- multiVcMissingScoreFactor = multiVcMissingScoreFactor,
+ missingScoreQuantile = missingScoreQuantile,
collocateFilterRegex = collocateFilterRegex,
vcLabel = vcLabel
) |>
@@ -301,7 +304,7 @@
return(res)
}
-add_multi_vc_comparisons <- function(result, thresholdScore, missingScoreFactor) {
+add_multi_vc_comparisons <- function(result, thresholdScore, missingScoreQuantile = 0.05) {
label <- node <- collocate <- NULL
if (!"label" %in% names(result) || dplyr::n_distinct(result$label) < 2) {
@@ -316,6 +319,67 @@
return(result)
}
+ compute_score_floor <- function(values) {
+ finite_values <- values[is.finite(values)]
+ if (length(finite_values) == 0) {
+ return(0)
+ }
+
+ prob <- min(max(missingScoreQuantile, 0), 0.5)
+ q_val <- suppressWarnings(stats::quantile(finite_values,
+ probs = prob,
+ names = FALSE,
+ type = 7
+ ))
+
+ if (!is.finite(q_val)) {
+ q_val <- suppressWarnings(min(finite_values, na.rm = TRUE))
+ }
+
+ min_val <- suppressWarnings(min(finite_values, na.rm = TRUE))
+ if (!is.finite(min_val)) {
+ min_val <- 0
+ }
+
+ spread_candidates <- c(
+ suppressWarnings(stats::IQR(finite_values, na.rm = TRUE, type = 7)),
+ stats::sd(finite_values, na.rm = TRUE),
+ abs(q_val) * 0.1,
+ abs(min_val - q_val)
+ )
+ spread_candidates <- spread_candidates[is.finite(spread_candidates)]
+
+ spread <- 0
+ if (length(spread_candidates) > 0) {
+ spread <- max(spread_candidates)
+ }
+ if (!is.finite(spread) || spread == 0) {
+ spread <- max(abs(q_val), abs(min_val), 1e-06)
+ }
+
+ candidate <- q_val - spread
+ if (!is.finite(candidate)) {
+ candidate <- min_val
+ }
+
+ floor_value <- suppressWarnings(min(c(candidate, min_val), na.rm = TRUE))
+ if (!is.finite(floor_value)) {
+ floor_value <- min_val
+ }
+ if (!is.finite(floor_value)) {
+ floor_value <- 0
+ }
+
+ floor_value
+ }
+
+ score_replacements <- stats::setNames(
+ vapply(score_cols, function(col) {
+ compute_score_floor(result[[col]])
+ }, numeric(1)),
+ score_cols
+ )
+
comparison <- result |>
dplyr::select(node, collocate, label, dplyr::all_of(score_cols)) |>
tidyr::pivot_wider(
@@ -388,13 +452,26 @@
}
if (length(labels) == 2) {
- fill_scores <- function(x, y) {
- min_val <- suppressWarnings(min(c(x, y), na.rm = TRUE))
- if (!is.finite(min_val)) {
- min_val <- 0
+ fill_scores <- function(x, y, measure_col) {
+ replacement <- score_replacements[[measure_col]]
+ fallback_min <- suppressWarnings(min(c(x, y), na.rm = TRUE))
+ if (!is.finite(fallback_min)) {
+ fallback_min <- 0
}
- x[is.na(x)] <- missingScoreFactor * min_val
- y[is.na(y)] <- missingScoreFactor * min_val
+ if (!is.null(replacement) && is.finite(replacement)) {
+ replacement <- min(replacement, fallback_min)
+ } else {
+ replacement <- fallback_min
+ }
+ if (!is.finite(replacement)) {
+ replacement <- 0
+ }
+ if (any(is.na(x))) {
+ x[is.na(x)] <- replacement
+ }
+ if (any(is.na(y))) {
+ y[is.na(y)] <- replacement
+ }
list(x = x, y = y)
}
@@ -426,7 +503,7 @@
if (!all(c(left_col, right_col) %in% names(comparison))) {
next
}
- filled <- fill_scores(comparison[[left_col]], comparison[[right_col]])
+ filled <- fill_scores(comparison[[left_col]], comparison[[right_col]], col)
comparison[[left_col]] <- filled$x
comparison[[right_col]] <- filled$y
comparison[[paste0("delta_", col)]] <- filled$x - filled$y
@@ -493,11 +570,22 @@
next
}
- min_val <- suppressWarnings(min(numeric_row, na.rm = TRUE))
- if (!is.finite(min_val)) {
- min_val <- 0
+ replacement <- score_replacements[[col]]
+ fallback_min <- suppressWarnings(min(numeric_row, na.rm = TRUE))
+ if (!is.finite(fallback_min)) {
+ fallback_min <- 0
}
- numeric_row[is.na(numeric_row)] <- missingScoreFactor * min_val
+ if (!is.null(replacement) && is.finite(replacement)) {
+ replacement <- min(replacement, fallback_min)
+ } else {
+ replacement <- fallback_min
+ }
+ if (!is.finite(replacement)) {
+ replacement <- 0
+ }
+ if (any(is.na(numeric_row))) {
+ numeric_row[is.na(numeric_row)] <- replacement
+ }
score_matrix[i, ] <- numeric_row
max_val <- suppressWarnings(max(numeric_row, na.rm = TRUE))
diff --git a/tests/testthat/test-collocations.R b/tests/testthat/test-collocations.R
index 52a2a66..0705a2b 100644
--- a/tests/testthat/test-collocations.R
+++ b/tests/testthat/test-collocations.R
@@ -121,7 +121,7 @@
pmi = c(2, 3)
)
- enriched <- RKorAPClient:::add_multi_vc_comparisons(sample_result, "logDice", 0.9)
+ enriched <- RKorAPClient:::add_multi_vc_comparisons(sample_result, "logDice")
expect_true(all(c(
"winner_logDice",
@@ -175,7 +175,7 @@
pmi = c(2, 3, 1)
)
- enriched <- RKorAPClient:::add_multi_vc_comparisons(sample_result, "logDice", 0.9)
+ enriched <- RKorAPClient:::add_multi_vc_comparisons(sample_result, "logDice")
expect_equal(enriched$winner_logDice[1], "B")
expect_equal(enriched$winner_logDice_value[1], 8)
expect_equal(enriched$runner_up_logDice[1], "A")
@@ -216,7 +216,7 @@
)
)
- enriched <- RKorAPClient:::add_multi_vc_comparisons(base_tbl, "logDice", 0.9)
+ enriched <- RKorAPClient:::add_multi_vc_comparisons(base_tbl, "logDice")
target_row <- enriched |>
dplyr::filter(collocate == "c1") |>
dplyr::slice_head(n = 1)
@@ -251,7 +251,7 @@
logDice = c(5, NA)
)
- enriched <- RKorAPClient:::add_multi_vc_comparisons(sample_result, "logDice", 0.9)
+ enriched <- RKorAPClient:::add_multi_vc_comparisons(sample_result, "logDice")
expect_equal(enriched$rank_A_logDice[1], 1)
expect_true(is.na(enriched$rank_B_logDice[1]))
@@ -261,6 +261,46 @@
expect_equal(enriched$max_delta_rank_logDice[1], 1)
})
+test_that("adaptive missing score imputation respects measure-specific scales", {
+ sample_result <- tibble::tibble(
+ node = c("n", "n", "n"),
+ collocate = c("c", "c", "c"),
+ vc = c("vc1", "vc2", "vc3"),
+ label = c("A", "B", "C"),
+ N = c(100, 100, 100),
+ O = c(12, 9, 7),
+ O1 = c(60, 40, 30),
+ O2 = c(33, 22, 18),
+ E = c(6, 6, 6),
+ w = c(2, 2, 2),
+ leftContextSize = c(1, 1, 1),
+ rightContextSize = c(1, 1, 1),
+ frequency = c(15, 11, 9),
+ logDice = c(-0.31, NA, -0.12),
+ pmi = c(-1.65, NA, -0.48),
+ ll = c(12.4, NA, 7.9)
+ )
+
+ enriched <- RKorAPClient:::add_multi_vc_comparisons(
+ sample_result,
+ "logDice",
+ missingScoreQuantile = 0.05
+ )
+
+ row_a <- dplyr::filter(enriched, label == "A") |> dplyr::slice_head(n = 1)
+
+ expect_false(is.na(row_a$logDice_B))
+ expect_false(is.na(row_a$pmi_B))
+ expect_false(is.na(row_a$ll_B))
+
+ expect_lt(row_a$logDice_B, min(sample_result$logDice, na.rm = TRUE))
+ expect_lt(row_a$pmi_B, min(sample_result$pmi, na.rm = TRUE))
+ expect_lte(row_a$ll_B, min(sample_result$ll, na.rm = TRUE))
+
+ expect_gt(row_a$max_delta_logDice, 0)
+ expect_gt(row_a$winner_logDice_value - row_a$loser_logDice_value, 0)
+})
+
# New tests for improved coverage of collocationAnalysis.R helper functions
test_that("synsemanticStopwords returns German stopwords", {