CA: add percentile ranks and documentation
Change-Id: I3a0c6aab970db5f4685b03164b20e0489e03799f
diff --git a/NAMESPACE b/NAMESPACE
index beb1dea..dddeb69 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -64,6 +64,7 @@
importFrom(curl,has_internet)
importFrom(dplyr,.data)
importFrom(dplyr,across)
+importFrom(dplyr,all_of)
importFrom(dplyr,anti_join)
importFrom(dplyr,arrange)
importFrom(dplyr,as_tibble)
@@ -77,6 +78,7 @@
importFrom(dplyr,first)
importFrom(dplyr,group_by)
importFrom(dplyr,if_else)
+importFrom(dplyr,left_join)
importFrom(dplyr,mutate)
importFrom(dplyr,n)
importFrom(dplyr,quo_name)
@@ -112,6 +114,7 @@
importFrom(magrittr,debug_pipe)
importFrom(purrr,map)
importFrom(purrr,pmap)
+importFrom(rlang,sym)
importFrom(stats,median)
importFrom(stats,prop.test)
importFrom(stringr,str_detect)
diff --git a/R/collocationAnalysis.R b/R/collocationAnalysis.R
index 67a25cb..b7c5be6 100644
--- a/R/collocationAnalysis.R
+++ b/R/collocationAnalysis.R
@@ -46,8 +46,20 @@
#' @param vcLabel optional label override for the current virtual corpus (used internally when named VC collections are expanded)
#' @param ... more arguments will be passed to [collocationScoreQuery()]
#' @inheritParams collocationScoreQuery,KorAPConnection-method
-#' @return Tibble with top collocates, association scores, corresponding URLs for web user interface queries, etc.
-#'
+#' @return
+#' A tibble where each row represents a candidate collocate for the requested node.
+#' Columns include (depending on the selected association measures):
+#'
+#' \itemize{
+#' \item \code{node}, \code{collocate}, \code{vc}, \code{label}: identifiers for the query node, collocate, virtual corpus, and optional label.
+#' \item Frequency and contingency information such as \code{frequency}, \code{O}, \code{O1}, \code{O2}, \code{E}, \code{leftContextSize}, \code{rightContextSize}, and \code{w}.
+#' \item Association measures (e.g. \code{logDice}, \code{ll}, \code{mi}, ...), one column per requested scorer.
+#' \item Per-labelled association scores produced by multi-VC comparisons using the pattern \code{<measure>_<label>}.
+#' \item Ranks per label/measure with the pattern \code{rank_<label>_<measure>} (1 is best) and the corresponding percentile ranks \code{percentile_rank_<label>_<measure>}.
+#' \item Pairwise contrasts for two-label comparisons, e.g. \code{delta_<measure>}, \code{delta_rank_<measure>}, and \code{delta_percentile_rank_<measure>}.
+#' \item Summary columns describing the strongest labels per measure (\code{winner_*}, \code{runner_up_*}, \code{loser_*}, and \code{max_delta_*}).
+#' \item Optional helper columns such as \code{query}, \code{example}, or \code{url} when example retrieval is requested.
+#' }
#' @importFrom dplyr arrange desc slice_head bind_rows group_by mutate ungroup left_join select row_number all_of first
#' @importFrom purrr pmap
#' @importFrom tidyr expand_grid pivot_wider
@@ -407,13 +419,18 @@
rank_tbl <- label_df[, c("node", "collocate"), drop = FALSE]
for (col in score_cols) {
rank_col_name <- paste0("rank_", safe_lab, "_", col)
+ percentile_col_name <- paste0("percentile_rank_", safe_lab, "_", col)
values <- label_df[[col]]
ranks <- rep(NA_real_, length(values))
+ percentiles <- rep(NA_real_, length(values))
valid_idx <- which(!is.na(values))
if (length(valid_idx) > 0) {
ranks[valid_idx] <- rank(-values[valid_idx], ties.method = "first")
+ total <- length(valid_idx)
+ percentiles[valid_idx] <- 1 - (ranks[valid_idx] - 1) / total
}
rank_tbl[[rank_col_name]] <- ranks
+ rank_tbl[[percentile_col_name]] <- percentiles
}
rank_data <- dplyr::left_join(rank_data, rank_tbl, by = c("node", "collocate"))
}
@@ -437,6 +454,15 @@
)
}
+ percentile_replacements <- numeric(0)
+ percentile_column_names <- grep("^percentile_rank_", names(comparison), value = TRUE)
+ if (length(percentile_column_names) > 0) {
+ percentile_replacements <- stats::setNames(
+ rep(0, length(percentile_column_names)),
+ percentile_column_names
+ )
+ }
+
collapse_label_values <- function(indices, safe_labels_vec) {
if (length(indices) == 0) {
return(NA_character_)
@@ -475,6 +501,24 @@
list(x = x, y = y)
}
+ fill_percentiles <- function(x, y, left_pct_col, right_pct_col) {
+ replacement_left <- percentile_replacements[[left_pct_col]]
+ if (is.null(replacement_left) || !is.finite(replacement_left)) {
+ replacement_left <- 0
+ }
+ replacement_right <- percentile_replacements[[right_pct_col]]
+ if (is.null(replacement_right) || !is.finite(replacement_right)) {
+ replacement_right <- 0
+ }
+ if (any(is.na(x))) {
+ x[is.na(x)] <- replacement_left
+ }
+ if (any(is.na(y))) {
+ y[is.na(y)] <- replacement_right
+ }
+ list(x = x, y = y)
+ }
+
fill_ranks <- function(x, y, left_rank_col, right_rank_col) {
fallback <- nrow(comparison) + 1
replacement_left <- rank_replacements[[left_rank_col]]
@@ -518,6 +562,17 @@
)
comparison[[paste0("delta_rank_", col)]] <- filled_rank$x - filled_rank$y
}
+ pct_left <- paste0("percentile_rank_", left_label, "_", col)
+ pct_right <- paste0("percentile_rank_", right_label, "_", col)
+ if (all(c(pct_left, pct_right) %in% names(comparison))) {
+ filled_pct <- fill_percentiles(
+ comparison[[pct_left]],
+ comparison[[pct_right]],
+ pct_left,
+ pct_right
+ )
+ comparison[[paste0("delta_percentile_rank_", col)]] <- filled_pct$x - filled_pct$y
+ }
}
}
@@ -712,6 +767,93 @@
comparison[[max_delta_rank_col]] <- max_deltas
}
+ for (col in score_cols) {
+ pct_cols <- paste0("percentile_rank_", labels, "_", col)
+ existing <- pct_cols %in% names(comparison)
+ if (!any(existing)) {
+ next
+ }
+ pct_cols <- pct_cols[existing]
+ safe_labels <- labels[existing]
+ pct_values <- comparison[, pct_cols, drop = FALSE]
+
+ winner_pct_label_col <- paste0("winner_percentile_rank_", col)
+ winner_pct_value_col <- paste0("winner_percentile_rank_", col, "_value")
+ runner_pct_label_col <- paste0("runner_up_percentile_rank_", col)
+ runner_pct_value_col <- paste0("runner_up_percentile_rank_", col, "_value")
+ loser_pct_label_col <- paste0("loser_percentile_rank_", col)
+ loser_pct_value_col <- paste0("loser_percentile_rank_", col, "_value")
+ max_delta_pct_col <- paste0("max_delta_percentile_rank_", col)
+
+ if (nrow(pct_values) == 0) {
+ comparison[[winner_pct_label_col]] <- character(0)
+ comparison[[winner_pct_value_col]] <- numeric(0)
+ comparison[[runner_pct_label_col]] <- character(0)
+ comparison[[runner_pct_value_col]] <- numeric(0)
+ comparison[[loser_pct_label_col]] <- character(0)
+ comparison[[loser_pct_value_col]] <- numeric(0)
+ comparison[[max_delta_pct_col]] <- numeric(0)
+ next
+ }
+
+ pct_matrix <- as.matrix(pct_values)
+ storage.mode(pct_matrix) <- "numeric"
+
+ n_rows <- nrow(pct_matrix)
+ winner_labels <- rep(NA_character_, n_rows)
+ winner_values <- rep(NA_real_, n_rows)
+ runner_labels <- rep(NA_character_, n_rows)
+ runner_values <- rep(NA_real_, n_rows)
+ loser_labels <- rep(NA_character_, n_rows)
+ loser_values <- rep(NA_real_, n_rows)
+ max_deltas <- rep(NA_real_, n_rows)
+
+ if (n_rows > 0) {
+ for (i in seq_len(n_rows)) {
+ numeric_row <- as.numeric(pct_matrix[i, ])
+ if (all(is.na(numeric_row))) {
+ next
+ }
+
+ if (any(is.na(numeric_row))) {
+ numeric_row[is.na(numeric_row)] <- 0
+ }
+ pct_matrix[i, ] <- numeric_row
+
+ max_val <- suppressWarnings(max(numeric_row, na.rm = TRUE))
+ max_idx <- which(numeric_row == max_val)
+ winner_labels[i] <- collapse_label_values(max_idx, safe_labels)
+ winner_values[i] <- max_val
+
+ unique_vals <- sort(unique(numeric_row), decreasing = TRUE)
+ if (length(unique_vals) >= 2) {
+ runner_val <- unique_vals[2]
+ runner_idx <- which(numeric_row == runner_val)
+ runner_labels[i] <- collapse_label_values(runner_idx, safe_labels)
+ runner_values[i] <- runner_val
+ }
+
+ min_val <- suppressWarnings(min(numeric_row, na.rm = TRUE))
+ min_idx <- which(numeric_row == min_val)
+ loser_labels[i] <- collapse_label_values(min_idx, safe_labels)
+ loser_values[i] <- min_val
+
+ if (is.finite(max_val) && is.finite(min_val)) {
+ max_deltas[i] <- max_val - min_val
+ }
+ }
+ }
+
+ comparison[, pct_cols] <- pct_matrix
+ comparison[[winner_pct_label_col]] <- winner_labels
+ comparison[[winner_pct_value_col]] <- winner_values
+ comparison[[runner_pct_label_col]] <- runner_labels
+ comparison[[runner_pct_value_col]] <- runner_values
+ comparison[[loser_pct_label_col]] <- loser_labels
+ comparison[[loser_pct_value_col]] <- loser_values
+ comparison[[max_delta_pct_col]] <- max_deltas
+ }
+
dplyr::left_join(result, comparison, by = c("node", "collocate"))
}
@@ -923,7 +1065,9 @@
"Eine",
"diese",
"Diese",
- "oder"
+ "oder",
+ "Es",
+ "Und"
)
return(res)
}
diff --git a/man/collocationAnalysis-KorAPConnection-method.Rd b/man/collocationAnalysis-KorAPConnection-method.Rd
index 6b13db7..9c97275 100644
--- a/man/collocationAnalysis-KorAPConnection-method.Rd
+++ b/man/collocationAnalysis-KorAPConnection-method.Rd
@@ -27,6 +27,8 @@
threshold = 2,
localStopwords = c(),
collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
+ missingScoreQuantile = 0.05,
+ vcLabel = NA_character_,
...
)
}
@@ -73,10 +75,26 @@
\item{collocateFilterRegex}{allow only collocates matching the regular expression}
+\item{missingScoreQuantile}{lower quantile (evaluated per association measure) that anchors the adaptive floor used for imputing missing scores between virtual corpora}
+
+\item{vcLabel}{optional label override for the current virtual corpus (used internally when named VC collections are expanded)}
+
\item{...}{more arguments will be passed to \code{\link[=collocationScoreQuery]{collocationScoreQuery()}}}
}
\value{
-Tibble with top collocates, association scores, corresponding URLs for web user interface queries, etc.
+A tibble where each row represents a candidate collocate for the requested node.
+Columns include (depending on the selected association measures):
+
+\itemize{
+\item \code{node}, \code{collocate}, \code{vc}, \code{label}: identifiers for the query node, collocate, virtual corpus, and optional label.
+\item Frequency and contingency information such as \code{frequency}, \code{O}, \code{O1}, \code{O2}, \code{E}, \code{leftContextSize}, \code{rightContextSize}, and \code{w}.
+\item Association measures (e.g. \code{logDice}, \code{ll}, \code{mi}, ...), one column per requested scorer.
+\item Per-labelled association scores produced by multi-VC comparisons using the pattern \code{<measure>_<label>}.
+\item Ranks per label/measure with the pattern \code{rank_<label>_<measure>} (1 is best) and the corresponding percentile ranks \code{percentile_rank_<label>_<measure>}.
+\item Pairwise contrasts for two-label comparisons, e.g. \code{delta_<measure>}, \code{delta_rank_<measure>}, and \code{delta_percentile_rank_<measure>}.
+\item Summary columns describing the strongest labels per measure (\code{winner_*}, \code{runner_up_*}, \code{loser_*}, and \code{max_delta_*}).
+\item Optional helper columns such as \code{query}, \code{example}, or \code{url} when example retrieval is requested.
+}
}
\description{
Performs a collocation analysis for the given node (or query)
diff --git a/tests/testthat/test-collocations.R b/tests/testthat/test-collocations.R
index 0705a2b..f79cf50 100644
--- a/tests/testthat/test-collocations.R
+++ b/tests/testthat/test-collocations.R
@@ -136,6 +136,13 @@
"loser_rank_logDice",
"loser_rank_logDice_value",
"max_delta_rank_logDice",
+ "winner_percentile_rank_logDice",
+ "winner_percentile_rank_logDice_value",
+ "runner_up_percentile_rank_logDice",
+ "runner_up_percentile_rank_logDice_value",
+ "loser_percentile_rank_logDice",
+ "loser_percentile_rank_logDice_value",
+ "max_delta_percentile_rank_logDice",
"winner_rank_pmi",
"winner_rank_pmi_value",
"runner_up_rank_pmi",
@@ -143,17 +150,33 @@
"loser_rank_pmi",
"loser_rank_pmi_value",
"max_delta_rank_pmi",
+ "winner_percentile_rank_pmi",
+ "winner_percentile_rank_pmi_value",
+ "runner_up_percentile_rank_pmi",
+ "runner_up_percentile_rank_pmi_value",
+ "loser_percentile_rank_pmi",
+ "loser_percentile_rank_pmi_value",
+ "max_delta_percentile_rank_pmi",
"rank_A_logDice",
"rank_B_logDice",
"rank_A_pmi",
"rank_B_pmi",
+ "percentile_rank_A_logDice",
+ "percentile_rank_B_logDice",
+ "percentile_rank_A_pmi",
+ "percentile_rank_B_pmi",
"delta_rank_logDice",
- "delta_rank_pmi"
+ "delta_rank_pmi",
+ "delta_percentile_rank_logDice",
+ "delta_percentile_rank_pmi"
) %in% colnames(enriched)))
expect_true(all(enriched$winner_logDice == "B"))
expect_true(all(enriched$runner_up_logDice == "A"))
expect_true(all(enriched$winner_logDice_value >= enriched$runner_up_logDice_value))
+ expect_true(all(enriched$percentile_rank_A_logDice == 1))
+ expect_true(all(enriched$percentile_rank_B_logDice == 1))
+ expect_true(all(enriched$delta_percentile_rank_logDice == 0))
})
test_that("add_multi_vc_comparisons handles more than two labels", {