CA: add queryMissingScores parameter

If TRUE, attempt to retrieve corpus-based association scores for
vc/collocate combinations that would otherwise be imputed, by
re-querying the KorAP backend without applying the collocate frequency
threshold.

Change-Id: I7d2707e16d6efab3515e4cebef4985e8b780a116
diff --git a/R/collocationAnalysis.R b/R/collocationAnalysis.R
index f373104..d314b68 100644
--- a/R/collocationAnalysis.R
+++ b/R/collocationAnalysis.R
@@ -42,6 +42,7 @@
 #' @param threshold              minimum value of `thresholdScore` function call to apply collocation analysis recursively
 #' @param localStopwords         vector of stopwords that will not be considered as collocates in the current function call, but that will not be passed to recursive calls
 #' @param collocateFilterRegex   allow only collocates matching the regular expression
+#' @param queryMissingScores     if TRUE, attempt to retrieve corpus-based association scores for vc/collocate combinations that would otherwise be imputed, by re-querying the KorAP backend without applying the collocate frequency threshold
 #' @param missingScoreQuantile   lower quantile (evaluated per association measure) that anchors the adaptive floor used for imputing missing scores between virtual corpora; a robust spread is subtracted from this anchor so the imputed values stay below the weakest observed scores
 #' @param vcLabel optional label override for the current virtual corpus (used internally when named VC collections are expanded)
 #' @param ...                    more arguments will be passed to [collocationScoreQuery()]
@@ -111,6 +112,7 @@
            threshold = 2.0,
            localStopwords = c(),
            collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
+           queryMissingScores = FALSE,
            missingScoreQuantile = 0.05,
            vcLabel = NA_character_,
            ...) {
@@ -167,6 +169,7 @@
           seed = seed,
           expand = expand,
           missingScoreQuantile = missingScoreQuantile,
+          queryMissingScores = queryMissingScores,
           collocateFilterRegex = collocateFilterRegex,
           vcLabel = vcLabel,
           ...
@@ -177,6 +180,16 @@
       if (!"vc" %in% names(multi_result) || nrow(multi_result) == 0) {
         multi_result
       } else {
+        if (queryMissingScores) {
+          multi_result <- backfill_missing_scores(
+            multi_result,
+            grid = grid,
+            kco = kco,
+            ignoreCollocateCase = ignoreCollocateCase,
+            ...
+          )
+        }
+
         if (!"label" %in% names(multi_result)) {
           multi_result$label <- NA_character_
         }
@@ -275,6 +288,7 @@
           addExamples = FALSE,
           missingScoreQuantile = missingScoreQuantile,
           collocateFilterRegex = collocateFilterRegex,
+          queryMissingScores = queryMissingScores,
           vcLabel = vcLabel
         ) |>
           bind_rows(result) |>
@@ -315,6 +329,107 @@
   return(res)
 }
 
+backfill_missing_scores <- function(result,
+                                    grid,
+                                    kco,
+                                    ignoreCollocateCase,
+                                    ...) {
+  if (!"vc" %in% names(result) || !"node" %in% names(result) || !"collocate" %in% names(result)) {
+    return(result)
+  }
+
+  if (nrow(result) == 0) {
+    return(result)
+  }
+
+  distinct_pairs <- dplyr::distinct(result, node, collocate)
+  if (nrow(distinct_pairs) == 0) {
+    return(result)
+  }
+
+  collocates_by_node <- split(as.character(distinct_pairs$collocate), distinct_pairs$node)
+  if (length(collocates_by_node) == 0) {
+    return(result)
+  }
+
+  required_combinations <- unique(as.data.frame(grid[, c("node", "vc", "vcLabel")], drop = FALSE))
+  for (i in seq_len(nrow(required_combinations))) {
+    node_value <- required_combinations$node[i]
+    vc_value <- required_combinations$vc[i]
+
+    collocate_pool <- collocates_by_node[[node_value]]
+    if (is.null(collocate_pool) || length(collocate_pool) == 0) {
+      next
+    }
+
+    existing_idx <- result$node == node_value & result$vc == vc_value
+    existing_collocates <- unique(as.character(result$collocate[existing_idx]))
+    missing_collocates <- setdiff(unique(collocate_pool), existing_collocates)
+    missing_collocates <- missing_collocates[!is.na(missing_collocates) & nzchar(missing_collocates)]
+
+    if (length(missing_collocates) == 0) {
+      next
+    }
+
+    context_rows <- result[result$node == node_value & result$vc == vc_value, , drop = FALSE]
+    if (nrow(context_rows) == 0) {
+      context_rows <- result[result$node == node_value, , drop = FALSE]
+    }
+
+    left_size <- context_rows$leftContextSize[!is.na(context_rows$leftContextSize)][1]
+    if (is.na(left_size) || length(left_size) == 0) {
+      left_size <- result$leftContextSize[!is.na(result$leftContextSize)][1]
+    }
+    if (is.na(left_size) || length(left_size) == 0) {
+      left_size <- 5
+    }
+
+    right_size <- context_rows$rightContextSize[!is.na(context_rows$rightContextSize)][1]
+    if (is.na(right_size) || length(right_size) == 0) {
+      right_size <- result$rightContextSize[!is.na(result$rightContextSize)][1]
+    }
+    if (is.na(right_size) || length(right_size) == 0) {
+      right_size <- 5
+    }
+
+    within_span_value <- ""
+    if ("query" %in% names(context_rows)) {
+      query_candidate <- context_rows$query[!is.na(context_rows$query) & nzchar(context_rows$query)][1]
+      if (!is.na(query_candidate) && nzchar(query_candidate)) {
+        match_one <- regexec("^\\(*contains\\(<([^>]+)>,", query_candidate)
+        matches <- regmatches(query_candidate, match_one)
+        if (length(matches) >= 1 && length(matches[[1]]) >= 2) {
+          within_span_value <- matches[[1]][2]
+        }
+      }
+    }
+
+    new_rows <- collocationScoreQuery(
+      kco,
+      node = node_value,
+      collocate = missing_collocates,
+      vc = vc_value,
+      leftContextSize = left_size,
+      rightContextSize = right_size,
+      ignoreCollocateCase = ignoreCollocateCase,
+      withinSpan = within_span_value,
+      ...
+    )
+
+    if (nrow(new_rows) == 0) {
+      next
+    }
+
+    if (!is.null(required_combinations$vcLabel[i]) && !is.na(required_combinations$vcLabel[i]) && required_combinations$vcLabel[i] != "" && "label" %in% names(new_rows)) {
+      new_rows$label <- required_combinations$vcLabel[i]
+    }
+
+    result <- dplyr::bind_rows(result, new_rows)
+  }
+
+  result
+}
+
 add_multi_vc_comparisons <- function(result, missingScoreQuantile = 0.05) {
   label <- node <- collocate <- NULL
 
diff --git a/man/collocationAnalysis-KorAPConnection-method.Rd b/man/collocationAnalysis-KorAPConnection-method.Rd
index a912430..bc23235 100644
--- a/man/collocationAnalysis-KorAPConnection-method.Rd
+++ b/man/collocationAnalysis-KorAPConnection-method.Rd
@@ -27,6 +27,7 @@
   threshold = 2,
   localStopwords = c(),
   collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
+  queryMissingScores = FALSE,
   missingScoreQuantile = 0.05,
   vcLabel = NA_character_,
   ...
@@ -75,6 +76,8 @@
 
 \item{collocateFilterRegex}{allow only collocates matching the regular expression}
 
+\item{queryMissingScores}{if TRUE, attempt to retrieve corpus-based association scores for vc/collocate combinations that would otherwise be imputed, by re-querying the KorAP backend without applying the collocate frequency threshold}
+
 \item{missingScoreQuantile}{lower quantile (evaluated per association measure) that anchors the adaptive floor used for imputing missing scores between virtual corpora; a robust spread is subtracted from this anchor so the imputed values stay below the weakest observed scores}
 
 \item{vcLabel}{optional label override for the current virtual corpus (used internally when named VC collections are expanded)}