CA: replace NA dampening factor with missingScoreQuantile

Better use distributional priors instead of raw minima.

Change-Id: I85d77a63c1bc06c8f6398ddac75deec52f1ca96a
diff --git a/R/collocationAnalysis.R b/R/collocationAnalysis.R
index 9446a4b..67a25cb 100644
--- a/R/collocationAnalysis.R
+++ b/R/collocationAnalysis.R
@@ -42,7 +42,7 @@
 #' @param threshold              minimum value of `thresholdScore` function call to apply collocation analysis recursively
 #' @param localStopwords         vector of stopwords that will not be considered as collocates in the current function call, but that will not be passed to recursive calls
 #' @param collocateFilterRegex   allow only collocates matching the regular expression
-#' @param multiVcMissingScoreFactor factor that is multiplied with the minimum observed score when imputing missing scores for delta computations between virtual corpora
+#' @param missingScoreQuantile   lower quantile (evaluated per association measure) that anchors the adaptive floor used for imputing missing scores between virtual corpora
 #' @param vcLabel optional label override for the current virtual corpus (used internally when named VC collections are expanded)
 #' @param ...                    more arguments will be passed to [collocationScoreQuery()]
 #' @inheritParams collocationScoreQuery,KorAPConnection-method
@@ -99,7 +99,7 @@
            threshold = 2.0,
            localStopwords = c(),
            collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
-           multiVcMissingScoreFactor = 0.9,
+           missingScoreQuantile = 0.05,
            vcLabel = NA_character_,
            ...) {
     word <- frequency <- O <- NULL
@@ -154,7 +154,7 @@
           localStopwords = localStopwords,
           seed = seed,
           expand = expand,
-          multiVcMissingScoreFactor = multiVcMissingScoreFactor,
+          missingScoreQuantile = missingScoreQuantile,
           collocateFilterRegex = collocateFilterRegex,
           vcLabel = vcLabel,
           ...
@@ -183,7 +183,10 @@
         }
 
         multi_result |>
-          add_multi_vc_comparisons(thresholdScore = thresholdScore, missingScoreFactor = multiVcMissingScoreFactor)
+          add_multi_vc_comparisons(
+            thresholdScore = thresholdScore,
+            missingScoreQuantile = missingScoreQuantile
+          )
       }
     } else {
       if ((is.na(vcLabel) || vcLabel == "") && length(vcNames) >= 1) {
@@ -259,7 +262,7 @@
           searchHitsSampleLimit = searchHitsSampleLimit,
           topCollocatesLimit = topCollocatesLimit,
           addExamples = FALSE,
-          multiVcMissingScoreFactor = multiVcMissingScoreFactor,
+          missingScoreQuantile = missingScoreQuantile,
           collocateFilterRegex = collocateFilterRegex,
           vcLabel = vcLabel
         ) |>
@@ -301,7 +304,7 @@
   return(res)
 }
 
-add_multi_vc_comparisons <- function(result, thresholdScore, missingScoreFactor) {
+add_multi_vc_comparisons <- function(result, thresholdScore, missingScoreQuantile = 0.05) {
   label <- node <- collocate <- NULL
 
   if (!"label" %in% names(result) || dplyr::n_distinct(result$label) < 2) {
@@ -316,6 +319,67 @@
     return(result)
   }
 
+  compute_score_floor <- function(values) {
+    finite_values <- values[is.finite(values)]
+    if (length(finite_values) == 0) {
+      return(0)
+    }
+
+    prob <- min(max(missingScoreQuantile, 0), 0.5)
+    q_val <- suppressWarnings(stats::quantile(finite_values,
+      probs = prob,
+      names = FALSE,
+      type = 7
+    ))
+
+    if (!is.finite(q_val)) {
+      q_val <- suppressWarnings(min(finite_values, na.rm = TRUE))
+    }
+
+    min_val <- suppressWarnings(min(finite_values, na.rm = TRUE))
+    if (!is.finite(min_val)) {
+      min_val <- 0
+    }
+
+    spread_candidates <- c(
+      suppressWarnings(stats::IQR(finite_values, na.rm = TRUE, type = 7)),
+      stats::sd(finite_values, na.rm = TRUE),
+      abs(q_val) * 0.1,
+      abs(min_val - q_val)
+    )
+    spread_candidates <- spread_candidates[is.finite(spread_candidates)]
+
+    spread <- 0
+    if (length(spread_candidates) > 0) {
+      spread <- max(spread_candidates)
+    }
+    if (!is.finite(spread) || spread == 0) {
+      spread <- max(abs(q_val), abs(min_val), 1e-06)
+    }
+
+    candidate <- q_val - spread
+    if (!is.finite(candidate)) {
+      candidate <- min_val
+    }
+
+    floor_value <- suppressWarnings(min(c(candidate, min_val), na.rm = TRUE))
+    if (!is.finite(floor_value)) {
+      floor_value <- min_val
+    }
+    if (!is.finite(floor_value)) {
+      floor_value <- 0
+    }
+
+    floor_value
+  }
+
+  score_replacements <- stats::setNames(
+    vapply(score_cols, function(col) {
+      compute_score_floor(result[[col]])
+    }, numeric(1)),
+    score_cols
+  )
+
   comparison <- result |>
     dplyr::select(node, collocate, label, dplyr::all_of(score_cols)) |>
     tidyr::pivot_wider(
@@ -388,13 +452,26 @@
   }
 
   if (length(labels) == 2) {
-    fill_scores <- function(x, y) {
-      min_val <- suppressWarnings(min(c(x, y), na.rm = TRUE))
-      if (!is.finite(min_val)) {
-        min_val <- 0
+    fill_scores <- function(x, y, measure_col) {
+      replacement <- score_replacements[[measure_col]]
+      fallback_min <- suppressWarnings(min(c(x, y), na.rm = TRUE))
+      if (!is.finite(fallback_min)) {
+        fallback_min <- 0
       }
-      x[is.na(x)] <- missingScoreFactor * min_val
-      y[is.na(y)] <- missingScoreFactor * min_val
+      if (!is.null(replacement) && is.finite(replacement)) {
+        replacement <- min(replacement, fallback_min)
+      } else {
+        replacement <- fallback_min
+      }
+      if (!is.finite(replacement)) {
+        replacement <- 0
+      }
+      if (any(is.na(x))) {
+        x[is.na(x)] <- replacement
+      }
+      if (any(is.na(y))) {
+        y[is.na(y)] <- replacement
+      }
       list(x = x, y = y)
     }
 
@@ -426,7 +503,7 @@
       if (!all(c(left_col, right_col) %in% names(comparison))) {
         next
       }
-      filled <- fill_scores(comparison[[left_col]], comparison[[right_col]])
+  filled <- fill_scores(comparison[[left_col]], comparison[[right_col]], col)
       comparison[[left_col]] <- filled$x
       comparison[[right_col]] <- filled$y
       comparison[[paste0("delta_", col)]] <- filled$x - filled$y
@@ -493,11 +570,22 @@
           next
         }
 
-        min_val <- suppressWarnings(min(numeric_row, na.rm = TRUE))
-        if (!is.finite(min_val)) {
-          min_val <- 0
+        replacement <- score_replacements[[col]]
+        fallback_min <- suppressWarnings(min(numeric_row, na.rm = TRUE))
+        if (!is.finite(fallback_min)) {
+          fallback_min <- 0
         }
-        numeric_row[is.na(numeric_row)] <- missingScoreFactor * min_val
+        if (!is.null(replacement) && is.finite(replacement)) {
+          replacement <- min(replacement, fallback_min)
+        } else {
+          replacement <- fallback_min
+        }
+        if (!is.finite(replacement)) {
+          replacement <- 0
+        }
+        if (any(is.na(numeric_row))) {
+          numeric_row[is.na(numeric_row)] <- replacement
+        }
         score_matrix[i, ] <- numeric_row
 
         max_val <- suppressWarnings(max(numeric_row, na.rm = TRUE))
diff --git a/tests/testthat/test-collocations.R b/tests/testthat/test-collocations.R
index 52a2a66..0705a2b 100644
--- a/tests/testthat/test-collocations.R
+++ b/tests/testthat/test-collocations.R
@@ -121,7 +121,7 @@
     pmi = c(2, 3)
   )
 
-  enriched <- RKorAPClient:::add_multi_vc_comparisons(sample_result, "logDice", 0.9)
+  enriched <- RKorAPClient:::add_multi_vc_comparisons(sample_result, "logDice")
 
   expect_true(all(c(
     "winner_logDice",
@@ -175,7 +175,7 @@
     pmi = c(2, 3, 1)
   )
 
-  enriched <- RKorAPClient:::add_multi_vc_comparisons(sample_result, "logDice", 0.9)
+  enriched <- RKorAPClient:::add_multi_vc_comparisons(sample_result, "logDice")
   expect_equal(enriched$winner_logDice[1], "B")
   expect_equal(enriched$winner_logDice_value[1], 8)
   expect_equal(enriched$runner_up_logDice[1], "A")
@@ -216,7 +216,7 @@
       )
     )
 
-  enriched <- RKorAPClient:::add_multi_vc_comparisons(base_tbl, "logDice", 0.9)
+  enriched <- RKorAPClient:::add_multi_vc_comparisons(base_tbl, "logDice")
   target_row <- enriched |>
     dplyr::filter(collocate == "c1") |>
     dplyr::slice_head(n = 1)
@@ -251,7 +251,7 @@
     logDice = c(5, NA)
   )
 
-  enriched <- RKorAPClient:::add_multi_vc_comparisons(sample_result, "logDice", 0.9)
+  enriched <- RKorAPClient:::add_multi_vc_comparisons(sample_result, "logDice")
 
   expect_equal(enriched$rank_A_logDice[1], 1)
   expect_true(is.na(enriched$rank_B_logDice[1]))
@@ -261,6 +261,46 @@
   expect_equal(enriched$max_delta_rank_logDice[1], 1)
 })
 
+test_that("adaptive missing score imputation respects measure-specific scales", {
+  sample_result <- tibble::tibble(
+    node = c("n", "n", "n"),
+    collocate = c("c", "c", "c"),
+    vc = c("vc1", "vc2", "vc3"),
+    label = c("A", "B", "C"),
+    N = c(100, 100, 100),
+    O = c(12, 9, 7),
+    O1 = c(60, 40, 30),
+    O2 = c(33, 22, 18),
+    E = c(6, 6, 6),
+    w = c(2, 2, 2),
+    leftContextSize = c(1, 1, 1),
+    rightContextSize = c(1, 1, 1),
+    frequency = c(15, 11, 9),
+    logDice = c(-0.31, NA, -0.12),
+    pmi = c(-1.65, NA, -0.48),
+    ll = c(12.4, NA, 7.9)
+  )
+
+  enriched <- RKorAPClient:::add_multi_vc_comparisons(
+    sample_result,
+    "logDice",
+    missingScoreQuantile = 0.05
+  )
+
+  row_a <- dplyr::filter(enriched, label == "A") |> dplyr::slice_head(n = 1)
+
+  expect_false(is.na(row_a$logDice_B))
+  expect_false(is.na(row_a$pmi_B))
+  expect_false(is.na(row_a$ll_B))
+
+  expect_lt(row_a$logDice_B, min(sample_result$logDice, na.rm = TRUE))
+  expect_lt(row_a$pmi_B, min(sample_result$pmi, na.rm = TRUE))
+  expect_lte(row_a$ll_B, min(sample_result$ll, na.rm = TRUE))
+
+  expect_gt(row_a$max_delta_logDice, 0)
+  expect_gt(row_a$winner_logDice_value - row_a$loser_logDice_value, 0)
+})
+
 # New tests for improved coverage of collocationAnalysis.R helper functions
 
 test_that("synsemanticStopwords returns German stopwords", {