CA: add percentile ranks and documentation Change-Id: I3a0c6aab970db5f4685b03164b20e0489e03799f

commit: 130a2a29eb3b593878dc2ac410e8a2b716d64e01 [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Sat Oct 18 16:09:23 2025 +0200
committer: Marc Kupietz <kupietz@ids-mannheim.de> Sun Oct 19 15:02:21 2025 +0200
tree: 3a50e5d226336ac30a7011ceec44a0e74e543919
parent: 9894a37ab8a660bca79e94c5480da5428e357e4f [diff]
diff --git a/NAMESPACE b/NAMESPACE
index beb1dea..dddeb69 100644
--- a/NAMESPACE
+++ b/NAMESPACE

@@ -64,6 +64,7 @@
 importFrom(curl,has_internet)
 importFrom(dplyr,.data)
 importFrom(dplyr,across)
+importFrom(dplyr,all_of)
 importFrom(dplyr,anti_join)
 importFrom(dplyr,arrange)
 importFrom(dplyr,as_tibble)
@@ -77,6 +78,7 @@
 importFrom(dplyr,first)
 importFrom(dplyr,group_by)
 importFrom(dplyr,if_else)
+importFrom(dplyr,left_join)
 importFrom(dplyr,mutate)
 importFrom(dplyr,n)
 importFrom(dplyr,quo_name)
@@ -112,6 +114,7 @@
 importFrom(magrittr,debug_pipe)
 importFrom(purrr,map)
 importFrom(purrr,pmap)
+importFrom(rlang,sym)
 importFrom(stats,median)
 importFrom(stats,prop.test)
 importFrom(stringr,str_detect)

diff --git a/R/collocationAnalysis.R b/R/collocationAnalysis.R
index 67a25cb..b7c5be6 100644
--- a/R/collocationAnalysis.R
+++ b/R/collocationAnalysis.R

@@ -46,8 +46,20 @@
 #' @param vcLabel optional label override for the current virtual corpus (used internally when named VC collections are expanded)
 #' @param ...                    more arguments will be passed to [collocationScoreQuery()]
 #' @inheritParams collocationScoreQuery,KorAPConnection-method
-#' @return Tibble with top collocates, association scores, corresponding URLs for web user interface queries, etc.
-#'
+#' @return
+#' A tibble where each row represents a candidate collocate for the requested node.
+#' Columns include (depending on the selected association measures):
+#' 
+#' \itemize{
+#'   \item \code{node}, \code{collocate}, \code{vc}, \code{label}: identifiers for the query node, collocate, virtual corpus, and optional label.
+#'   \item Frequency and contingency information such as \code{frequency}, \code{O}, \code{O1}, \code{O2}, \code{E}, \code{leftContextSize}, \code{rightContextSize}, and \code{w}.
+#'   \item Association measures (e.g. \code{logDice}, \code{ll}, \code{mi}, ...), one column per requested scorer.
+#'   \item Per-labelled association scores produced by multi-VC comparisons using the pattern \code{<measure>_<label>}.
+#'   \item Ranks per label/measure with the pattern \code{rank_<label>_<measure>} (1 is best) and the corresponding percentile ranks \code{percentile_rank_<label>_<measure>}.
+#'   \item Pairwise contrasts for two-label comparisons, e.g. \code{delta_<measure>}, \code{delta_rank_<measure>}, and \code{delta_percentile_rank_<measure>}.
+#'   \item Summary columns describing the strongest labels per measure (\code{winner_*}, \code{runner_up_*}, \code{loser_*}, and \code{max_delta_*}).
+#'   \item Optional helper columns such as \code{query}, \code{example}, or \code{url} when example retrieval is requested.
+#' }
 #' @importFrom dplyr arrange desc slice_head bind_rows group_by mutate ungroup left_join select row_number all_of first
 #' @importFrom purrr pmap
 #' @importFrom tidyr expand_grid pivot_wider
@@ -407,13 +419,18 @@
     rank_tbl <- label_df[, c("node", "collocate"), drop = FALSE]
     for (col in score_cols) {
       rank_col_name <- paste0("rank_", safe_lab, "_", col)
+      percentile_col_name <- paste0("percentile_rank_", safe_lab, "_", col)
       values <- label_df[[col]]
       ranks <- rep(NA_real_, length(values))
+      percentiles <- rep(NA_real_, length(values))
       valid_idx <- which(!is.na(values))
       if (length(valid_idx) > 0) {
         ranks[valid_idx] <- rank(-values[valid_idx], ties.method = "first")
+        total <- length(valid_idx)
+        percentiles[valid_idx] <- 1 - (ranks[valid_idx] - 1) / total
       }
       rank_tbl[[rank_col_name]] <- ranks
+      rank_tbl[[percentile_col_name]] <- percentiles
     }
     rank_data <- dplyr::left_join(rank_data, rank_tbl, by = c("node", "collocate"))
   }
@@ -437,6 +454,15 @@
     )
   }
 
+  percentile_replacements <- numeric(0)
+  percentile_column_names <- grep("^percentile_rank_", names(comparison), value = TRUE)
+  if (length(percentile_column_names) > 0) {
+    percentile_replacements <- stats::setNames(
+      rep(0, length(percentile_column_names)),
+      percentile_column_names
+    )
+  }
+
   collapse_label_values <- function(indices, safe_labels_vec) {
     if (length(indices) == 0) {
       return(NA_character_)
@@ -475,6 +501,24 @@
       list(x = x, y = y)
     }
 
+    fill_percentiles <- function(x, y, left_pct_col, right_pct_col) {
+      replacement_left <- percentile_replacements[[left_pct_col]]
+      if (is.null(replacement_left) || !is.finite(replacement_left)) {
+        replacement_left <- 0
+      }
+      replacement_right <- percentile_replacements[[right_pct_col]]
+      if (is.null(replacement_right) || !is.finite(replacement_right)) {
+        replacement_right <- 0
+      }
+      if (any(is.na(x))) {
+        x[is.na(x)] <- replacement_left
+      }
+      if (any(is.na(y))) {
+        y[is.na(y)] <- replacement_right
+      }
+      list(x = x, y = y)
+    }
+
     fill_ranks <- function(x, y, left_rank_col, right_rank_col) {
       fallback <- nrow(comparison) + 1
       replacement_left <- rank_replacements[[left_rank_col]]
@@ -518,6 +562,17 @@
         )
         comparison[[paste0("delta_rank_", col)]] <- filled_rank$x - filled_rank$y
       }
+      pct_left <- paste0("percentile_rank_", left_label, "_", col)
+      pct_right <- paste0("percentile_rank_", right_label, "_", col)
+      if (all(c(pct_left, pct_right) %in% names(comparison))) {
+        filled_pct <- fill_percentiles(
+          comparison[[pct_left]],
+          comparison[[pct_right]],
+          pct_left,
+          pct_right
+        )
+        comparison[[paste0("delta_percentile_rank_", col)]] <- filled_pct$x - filled_pct$y
+      }
     }
   }
 
@@ -712,6 +767,93 @@
     comparison[[max_delta_rank_col]] <- max_deltas
   }
 
+  for (col in score_cols) {
+    pct_cols <- paste0("percentile_rank_", labels, "_", col)
+    existing <- pct_cols %in% names(comparison)
+    if (!any(existing)) {
+      next
+    }
+    pct_cols <- pct_cols[existing]
+    safe_labels <- labels[existing]
+    pct_values <- comparison[, pct_cols, drop = FALSE]
+
+    winner_pct_label_col <- paste0("winner_percentile_rank_", col)
+    winner_pct_value_col <- paste0("winner_percentile_rank_", col, "_value")
+    runner_pct_label_col <- paste0("runner_up_percentile_rank_", col)
+    runner_pct_value_col <- paste0("runner_up_percentile_rank_", col, "_value")
+    loser_pct_label_col <- paste0("loser_percentile_rank_", col)
+    loser_pct_value_col <- paste0("loser_percentile_rank_", col, "_value")
+    max_delta_pct_col <- paste0("max_delta_percentile_rank_", col)
+
+    if (nrow(pct_values) == 0) {
+      comparison[[winner_pct_label_col]] <- character(0)
+      comparison[[winner_pct_value_col]] <- numeric(0)
+      comparison[[runner_pct_label_col]] <- character(0)
+      comparison[[runner_pct_value_col]] <- numeric(0)
+      comparison[[loser_pct_label_col]] <- character(0)
+      comparison[[loser_pct_value_col]] <- numeric(0)
+      comparison[[max_delta_pct_col]] <- numeric(0)
+      next
+    }
+
+    pct_matrix <- as.matrix(pct_values)
+    storage.mode(pct_matrix) <- "numeric"
+
+    n_rows <- nrow(pct_matrix)
+    winner_labels <- rep(NA_character_, n_rows)
+    winner_values <- rep(NA_real_, n_rows)
+    runner_labels <- rep(NA_character_, n_rows)
+    runner_values <- rep(NA_real_, n_rows)
+    loser_labels <- rep(NA_character_, n_rows)
+    loser_values <- rep(NA_real_, n_rows)
+    max_deltas <- rep(NA_real_, n_rows)
+
+    if (n_rows > 0) {
+      for (i in seq_len(n_rows)) {
+        numeric_row <- as.numeric(pct_matrix[i, ])
+        if (all(is.na(numeric_row))) {
+          next
+        }
+
+        if (any(is.na(numeric_row))) {
+          numeric_row[is.na(numeric_row)] <- 0
+        }
+        pct_matrix[i, ] <- numeric_row
+
+        max_val <- suppressWarnings(max(numeric_row, na.rm = TRUE))
+        max_idx <- which(numeric_row == max_val)
+        winner_labels[i] <- collapse_label_values(max_idx, safe_labels)
+        winner_values[i] <- max_val
+
+        unique_vals <- sort(unique(numeric_row), decreasing = TRUE)
+        if (length(unique_vals) >= 2) {
+          runner_val <- unique_vals[2]
+          runner_idx <- which(numeric_row == runner_val)
+          runner_labels[i] <- collapse_label_values(runner_idx, safe_labels)
+          runner_values[i] <- runner_val
+        }
+
+        min_val <- suppressWarnings(min(numeric_row, na.rm = TRUE))
+        min_idx <- which(numeric_row == min_val)
+        loser_labels[i] <- collapse_label_values(min_idx, safe_labels)
+        loser_values[i] <- min_val
+
+        if (is.finite(max_val) && is.finite(min_val)) {
+          max_deltas[i] <- max_val - min_val
+        }
+      }
+    }
+
+    comparison[, pct_cols] <- pct_matrix
+    comparison[[winner_pct_label_col]] <- winner_labels
+    comparison[[winner_pct_value_col]] <- winner_values
+    comparison[[runner_pct_label_col]] <- runner_labels
+    comparison[[runner_pct_value_col]] <- runner_values
+    comparison[[loser_pct_label_col]] <- loser_labels
+    comparison[[loser_pct_value_col]] <- loser_values
+    comparison[[max_delta_pct_col]] <- max_deltas
+  }
+
   dplyr::left_join(result, comparison, by = c("node", "collocate"))
 }
 
@@ -923,7 +1065,9 @@
     "Eine",
     "diese",
     "Diese",
-    "oder"
+    "oder",
+    "Es",
+    "Und"
   )
   return(res)
 }

diff --git a/man/collocationAnalysis-KorAPConnection-method.Rd b/man/collocationAnalysis-KorAPConnection-method.Rd
index 6b13db7..9c97275 100644
--- a/man/collocationAnalysis-KorAPConnection-method.Rd
+++ b/man/collocationAnalysis-KorAPConnection-method.Rd

@@ -27,6 +27,8 @@
   threshold = 2,
   localStopwords = c(),
   collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
+  missingScoreQuantile = 0.05,
+  vcLabel = NA_character_,
   ...
 )
 }
@@ -73,10 +75,26 @@
 
 \item{collocateFilterRegex}{allow only collocates matching the regular expression}
 
+\item{missingScoreQuantile}{lower quantile (evaluated per association measure) that anchors the adaptive floor used for imputing missing scores between virtual corpora}
+
+\item{vcLabel}{optional label override for the current virtual corpus (used internally when named VC collections are expanded)}
+
 \item{...}{more arguments will be passed to \code{\link[=collocationScoreQuery]{collocationScoreQuery()}}}
 }
 \value{
-Tibble with top collocates, association scores, corresponding URLs for web user interface queries, etc.
+A tibble where each row represents a candidate collocate for the requested node.
+Columns include (depending on the selected association measures):
+
+\itemize{
+\item \code{node}, \code{collocate}, \code{vc}, \code{label}: identifiers for the query node, collocate, virtual corpus, and optional label.
+\item Frequency and contingency information such as \code{frequency}, \code{O}, \code{O1}, \code{O2}, \code{E}, \code{leftContextSize}, \code{rightContextSize}, and \code{w}.
+\item Association measures (e.g. \code{logDice}, \code{ll}, \code{mi}, ...), one column per requested scorer.
+\item Per-labelled association scores produced by multi-VC comparisons using the pattern \code{<measure>_<label>}.
+\item Ranks per label/measure with the pattern \code{rank_<label>_<measure>} (1 is best) and the corresponding percentile ranks \code{percentile_rank_<label>_<measure>}.
+\item Pairwise contrasts for two-label comparisons, e.g. \code{delta_<measure>}, \code{delta_rank_<measure>}, and \code{delta_percentile_rank_<measure>}.
+\item Summary columns describing the strongest labels per measure (\code{winner_*}, \code{runner_up_*}, \code{loser_*}, and \code{max_delta_*}).
+\item Optional helper columns such as \code{query}, \code{example}, or \code{url} when example retrieval is requested.
+}
 }
 \description{
 Performs a collocation analysis for the given node (or query)

diff --git a/tests/testthat/test-collocations.R b/tests/testthat/test-collocations.R
index 0705a2b..f79cf50 100644
--- a/tests/testthat/test-collocations.R
+++ b/tests/testthat/test-collocations.R

@@ -136,6 +136,13 @@
     "loser_rank_logDice",
     "loser_rank_logDice_value",
     "max_delta_rank_logDice",
+    "winner_percentile_rank_logDice",
+    "winner_percentile_rank_logDice_value",
+    "runner_up_percentile_rank_logDice",
+    "runner_up_percentile_rank_logDice_value",
+    "loser_percentile_rank_logDice",
+    "loser_percentile_rank_logDice_value",
+    "max_delta_percentile_rank_logDice",
     "winner_rank_pmi",
     "winner_rank_pmi_value",
     "runner_up_rank_pmi",
@@ -143,17 +150,33 @@
     "loser_rank_pmi",
     "loser_rank_pmi_value",
     "max_delta_rank_pmi",
+    "winner_percentile_rank_pmi",
+    "winner_percentile_rank_pmi_value",
+    "runner_up_percentile_rank_pmi",
+    "runner_up_percentile_rank_pmi_value",
+    "loser_percentile_rank_pmi",
+    "loser_percentile_rank_pmi_value",
+    "max_delta_percentile_rank_pmi",
     "rank_A_logDice",
     "rank_B_logDice",
     "rank_A_pmi",
     "rank_B_pmi",
+    "percentile_rank_A_logDice",
+    "percentile_rank_B_logDice",
+    "percentile_rank_A_pmi",
+    "percentile_rank_B_pmi",
     "delta_rank_logDice",
-    "delta_rank_pmi"
+    "delta_rank_pmi",
+    "delta_percentile_rank_logDice",
+    "delta_percentile_rank_pmi"
   ) %in% colnames(enriched)))
 
   expect_true(all(enriched$winner_logDice == "B"))
   expect_true(all(enriched$runner_up_logDice == "A"))
   expect_true(all(enriched$winner_logDice_value >= enriched$runner_up_logDice_value))
+  expect_true(all(enriched$percentile_rank_A_logDice == 1))
+  expect_true(all(enriched$percentile_rank_B_logDice == 1))
+  expect_true(all(enriched$delta_percentile_rank_logDice == 0))
 })
 
 test_that("add_multi_vc_comparisons handles more than two labels", {
commit	130a2a29eb3b593878dc2ac410e8a2b716d64e01	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Sat Oct 18 16:09:23 2025 +0200
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Sun Oct 19 15:02:21 2025 +0200
tree	3a50e5d226336ac30a7011ceec44a0e74e543919
parent	9894a37ab8a660bca79e94c5480da5428e357e4f [diff]