CA: add percentile ranks and documentation Change-Id: I3a0c6aab970db5f4685b03164b20e0489e03799f

commit: 130a2a29eb3b593878dc2ac410e8a2b716d64e01 [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Sat Oct 18 16:09:23 2025 +0200
committer: Marc Kupietz <kupietz@ids-mannheim.de> Sun Oct 19 15:02:21 2025 +0200
tree: 3a50e5d226336ac30a7011ceec44a0e74e543919
parent: 9894a37ab8a660bca79e94c5480da5428e357e4f [diff] [blame]
diff --git a/R/collocationAnalysis.R b/R/collocationAnalysis.R
index 67a25cb..b7c5be6 100644
--- a/R/collocationAnalysis.R
+++ b/R/collocationAnalysis.R

@@ -46,8 +46,20 @@
 #' @param vcLabel optional label override for the current virtual corpus (used internally when named VC collections are expanded)
 #' @param ...                    more arguments will be passed to [collocationScoreQuery()]
 #' @inheritParams collocationScoreQuery,KorAPConnection-method
-#' @return Tibble with top collocates, association scores, corresponding URLs for web user interface queries, etc.
-#'
+#' @return
+#' A tibble where each row represents a candidate collocate for the requested node.
+#' Columns include (depending on the selected association measures):
+#' 
+#' \itemize{
+#'   \item \code{node}, \code{collocate}, \code{vc}, \code{label}: identifiers for the query node, collocate, virtual corpus, and optional label.
+#'   \item Frequency and contingency information such as \code{frequency}, \code{O}, \code{O1}, \code{O2}, \code{E}, \code{leftContextSize}, \code{rightContextSize}, and \code{w}.
+#'   \item Association measures (e.g. \code{logDice}, \code{ll}, \code{mi}, ...), one column per requested scorer.
+#'   \item Per-labelled association scores produced by multi-VC comparisons using the pattern \code{<measure>_<label>}.
+#'   \item Ranks per label/measure with the pattern \code{rank_<label>_<measure>} (1 is best) and the corresponding percentile ranks \code{percentile_rank_<label>_<measure>}.
+#'   \item Pairwise contrasts for two-label comparisons, e.g. \code{delta_<measure>}, \code{delta_rank_<measure>}, and \code{delta_percentile_rank_<measure>}.
+#'   \item Summary columns describing the strongest labels per measure (\code{winner_*}, \code{runner_up_*}, \code{loser_*}, and \code{max_delta_*}).
+#'   \item Optional helper columns such as \code{query}, \code{example}, or \code{url} when example retrieval is requested.
+#' }
 #' @importFrom dplyr arrange desc slice_head bind_rows group_by mutate ungroup left_join select row_number all_of first
 #' @importFrom purrr pmap
 #' @importFrom tidyr expand_grid pivot_wider
@@ -407,13 +419,18 @@
     rank_tbl <- label_df[, c("node", "collocate"), drop = FALSE]
     for (col in score_cols) {
       rank_col_name <- paste0("rank_", safe_lab, "_", col)
+      percentile_col_name <- paste0("percentile_rank_", safe_lab, "_", col)
       values <- label_df[[col]]
       ranks <- rep(NA_real_, length(values))
+      percentiles <- rep(NA_real_, length(values))
       valid_idx <- which(!is.na(values))
       if (length(valid_idx) > 0) {
         ranks[valid_idx] <- rank(-values[valid_idx], ties.method = "first")
+        total <- length(valid_idx)
+        percentiles[valid_idx] <- 1 - (ranks[valid_idx] - 1) / total
       }
       rank_tbl[[rank_col_name]] <- ranks
+      rank_tbl[[percentile_col_name]] <- percentiles
     }
     rank_data <- dplyr::left_join(rank_data, rank_tbl, by = c("node", "collocate"))
   }
@@ -437,6 +454,15 @@
     )
   }
 
+  percentile_replacements <- numeric(0)
+  percentile_column_names <- grep("^percentile_rank_", names(comparison), value = TRUE)
+  if (length(percentile_column_names) > 0) {
+    percentile_replacements <- stats::setNames(
+      rep(0, length(percentile_column_names)),
+      percentile_column_names
+    )
+  }
+
   collapse_label_values <- function(indices, safe_labels_vec) {
     if (length(indices) == 0) {
       return(NA_character_)
@@ -475,6 +501,24 @@
       list(x = x, y = y)
     }
 
+    fill_percentiles <- function(x, y, left_pct_col, right_pct_col) {
+      replacement_left <- percentile_replacements[[left_pct_col]]
+      if (is.null(replacement_left) || !is.finite(replacement_left)) {
+        replacement_left <- 0
+      }
+      replacement_right <- percentile_replacements[[right_pct_col]]
+      if (is.null(replacement_right) || !is.finite(replacement_right)) {
+        replacement_right <- 0
+      }
+      if (any(is.na(x))) {
+        x[is.na(x)] <- replacement_left
+      }
+      if (any(is.na(y))) {
+        y[is.na(y)] <- replacement_right
+      }
+      list(x = x, y = y)
+    }
+
     fill_ranks <- function(x, y, left_rank_col, right_rank_col) {
       fallback <- nrow(comparison) + 1
       replacement_left <- rank_replacements[[left_rank_col]]
@@ -518,6 +562,17 @@
         )
         comparison[[paste0("delta_rank_", col)]] <- filled_rank$x - filled_rank$y
       }
+      pct_left <- paste0("percentile_rank_", left_label, "_", col)
+      pct_right <- paste0("percentile_rank_", right_label, "_", col)
+      if (all(c(pct_left, pct_right) %in% names(comparison))) {
+        filled_pct <- fill_percentiles(
+          comparison[[pct_left]],
+          comparison[[pct_right]],
+          pct_left,
+          pct_right
+        )
+        comparison[[paste0("delta_percentile_rank_", col)]] <- filled_pct$x - filled_pct$y
+      }
     }
   }
 
@@ -712,6 +767,93 @@
     comparison[[max_delta_rank_col]] <- max_deltas
   }
 
+  for (col in score_cols) {
+    pct_cols <- paste0("percentile_rank_", labels, "_", col)
+    existing <- pct_cols %in% names(comparison)
+    if (!any(existing)) {
+      next
+    }
+    pct_cols <- pct_cols[existing]
+    safe_labels <- labels[existing]
+    pct_values <- comparison[, pct_cols, drop = FALSE]
+
+    winner_pct_label_col <- paste0("winner_percentile_rank_", col)
+    winner_pct_value_col <- paste0("winner_percentile_rank_", col, "_value")
+    runner_pct_label_col <- paste0("runner_up_percentile_rank_", col)
+    runner_pct_value_col <- paste0("runner_up_percentile_rank_", col, "_value")
+    loser_pct_label_col <- paste0("loser_percentile_rank_", col)
+    loser_pct_value_col <- paste0("loser_percentile_rank_", col, "_value")
+    max_delta_pct_col <- paste0("max_delta_percentile_rank_", col)
+
+    if (nrow(pct_values) == 0) {
+      comparison[[winner_pct_label_col]] <- character(0)
+      comparison[[winner_pct_value_col]] <- numeric(0)
+      comparison[[runner_pct_label_col]] <- character(0)
+      comparison[[runner_pct_value_col]] <- numeric(0)
+      comparison[[loser_pct_label_col]] <- character(0)
+      comparison[[loser_pct_value_col]] <- numeric(0)
+      comparison[[max_delta_pct_col]] <- numeric(0)
+      next
+    }
+
+    pct_matrix <- as.matrix(pct_values)
+    storage.mode(pct_matrix) <- "numeric"
+
+    n_rows <- nrow(pct_matrix)
+    winner_labels <- rep(NA_character_, n_rows)
+    winner_values <- rep(NA_real_, n_rows)
+    runner_labels <- rep(NA_character_, n_rows)
+    runner_values <- rep(NA_real_, n_rows)
+    loser_labels <- rep(NA_character_, n_rows)
+    loser_values <- rep(NA_real_, n_rows)
+    max_deltas <- rep(NA_real_, n_rows)
+
+    if (n_rows > 0) {
+      for (i in seq_len(n_rows)) {
+        numeric_row <- as.numeric(pct_matrix[i, ])
+        if (all(is.na(numeric_row))) {
+          next
+        }
+
+        if (any(is.na(numeric_row))) {
+          numeric_row[is.na(numeric_row)] <- 0
+        }
+        pct_matrix[i, ] <- numeric_row
+
+        max_val <- suppressWarnings(max(numeric_row, na.rm = TRUE))
+        max_idx <- which(numeric_row == max_val)
+        winner_labels[i] <- collapse_label_values(max_idx, safe_labels)
+        winner_values[i] <- max_val
+
+        unique_vals <- sort(unique(numeric_row), decreasing = TRUE)
+        if (length(unique_vals) >= 2) {
+          runner_val <- unique_vals[2]
+          runner_idx <- which(numeric_row == runner_val)
+          runner_labels[i] <- collapse_label_values(runner_idx, safe_labels)
+          runner_values[i] <- runner_val
+        }
+
+        min_val <- suppressWarnings(min(numeric_row, na.rm = TRUE))
+        min_idx <- which(numeric_row == min_val)
+        loser_labels[i] <- collapse_label_values(min_idx, safe_labels)
+        loser_values[i] <- min_val
+
+        if (is.finite(max_val) && is.finite(min_val)) {
+          max_deltas[i] <- max_val - min_val
+        }
+      }
+    }
+
+    comparison[, pct_cols] <- pct_matrix
+    comparison[[winner_pct_label_col]] <- winner_labels
+    comparison[[winner_pct_value_col]] <- winner_values
+    comparison[[runner_pct_label_col]] <- runner_labels
+    comparison[[runner_pct_value_col]] <- runner_values
+    comparison[[loser_pct_label_col]] <- loser_labels
+    comparison[[loser_pct_value_col]] <- loser_values
+    comparison[[max_delta_pct_col]] <- max_deltas
+  }
+
   dplyr::left_join(result, comparison, by = c("node", "collocate"))
 }
 
@@ -923,7 +1065,9 @@
     "Eine",
     "diese",
     "Diese",
-    "oder"
+    "oder",
+    "Es",
+    "Und"
   )
   return(res)
 }
commit	130a2a29eb3b593878dc2ac410e8a2b716d64e01	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Sat Oct 18 16:09:23 2025 +0200
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Sun Oct 19 15:02:21 2025 +0200
tree	3a50e5d226336ac30a7011ceec44a0e74e543919
parent	9894a37ab8a660bca79e94c5480da5428e357e4f [diff] [blame]