CA: add measure based ranks
Change-Id: I84dacfe9f61e9b65268b241bc8adf82c4cc4adc3
diff --git a/R/collocationAnalysis.R b/R/collocationAnalysis.R
index bf4318f..9446a4b 100644
--- a/R/collocationAnalysis.R
+++ b/R/collocationAnalysis.R
@@ -302,7 +302,7 @@
}
add_multi_vc_comparisons <- function(result, thresholdScore, missingScoreFactor) {
- label <- node <- collocate <- rankWithinLabel <- NULL
+ label <- node <- collocate <- NULL
if (!"label" %in% names(result) || dplyr::n_distinct(result$label) < 2) {
return(result)
@@ -312,29 +312,15 @@
non_score_cols <- c("N", "O", "O1", "O2", "E", "w", "leftContextSize", "rightContextSize", "frequency")
score_cols <- setdiff(numeric_cols, non_score_cols)
- score_cols <- setdiff(score_cols, "rankWithinLabel")
-
if (length(score_cols) == 0) {
return(result)
}
- ranking_col <- thresholdScore
- if (is.null(ranking_col) || is.na(ranking_col) || !ranking_col %in% score_cols) {
- ranking_col <- if ("logDice" %in% score_cols) "logDice" else score_cols[1]
- }
-
- ranking_sym <- rlang::sym(ranking_col)
-
- result <- result |>
- dplyr::group_by(label) |>
- dplyr::mutate(rankWithinLabel = dplyr::row_number(dplyr::desc(!!ranking_sym))) |>
- dplyr::ungroup()
-
comparison <- result |>
- dplyr::select(node, collocate, label, rankWithinLabel, dplyr::all_of(score_cols)) |>
- pivot_wider(
+ dplyr::select(node, collocate, label, dplyr::all_of(score_cols)) |>
+ tidyr::pivot_wider(
names_from = label,
- values_from = c(rankWithinLabel, dplyr::all_of(score_cols)),
+ values_from = dplyr::all_of(score_cols),
names_glue = "{.value}_{make.names(label)}",
values_fn = dplyr::first
)
@@ -343,6 +329,64 @@
labels <- make.names(raw_labels)
label_map <- stats::setNames(raw_labels, labels)
+ rank_data <- result |>
+ dplyr::distinct(node, collocate)
+
+ for (i in seq_along(raw_labels)) {
+ raw_lab <- raw_labels[i]
+ safe_lab <- labels[i]
+ label_df <- result[result$label == raw_lab, c("node", "collocate", score_cols), drop = FALSE]
+ if (nrow(label_df) == 0) {
+ next
+ }
+ label_df <- dplyr::distinct(label_df)
+ rank_tbl <- label_df[, c("node", "collocate"), drop = FALSE]
+ for (col in score_cols) {
+ rank_col_name <- paste0("rank_", safe_lab, "_", col)
+ values <- label_df[[col]]
+ ranks <- rep(NA_real_, length(values))
+ valid_idx <- which(!is.na(values))
+ if (length(valid_idx) > 0) {
+ ranks[valid_idx] <- rank(-values[valid_idx], ties.method = "first")
+ }
+ rank_tbl[[rank_col_name]] <- ranks
+ }
+ rank_data <- dplyr::left_join(rank_data, rank_tbl, by = c("node", "collocate"))
+ }
+
+ comparison <- dplyr::left_join(comparison, rank_data, by = c("node", "collocate"))
+
+ rank_replacements <- numeric(0)
+ rank_column_names <- grep("^rank_", names(comparison), value = TRUE)
+ if (length(rank_column_names) > 0) {
+ rank_replacements <- stats::setNames(
+ vapply(rank_column_names, function(col) {
+ col_values <- comparison[[col]]
+ valid_values <- col_values[!is.na(col_values)]
+ if (length(valid_values) == 0) {
+ nrow(comparison) + 1
+ } else {
+ suppressWarnings(max(valid_values, na.rm = TRUE)) + 1
+ }
+ }, numeric(1)),
+ rank_column_names
+ )
+ }
+
+ collapse_label_values <- function(indices, safe_labels_vec) {
+ if (length(indices) == 0) {
+ return(NA_character_)
+ }
+ labs <- label_map[safe_labels_vec[indices]]
+ fallback <- safe_labels_vec[indices]
+ labs[is.na(labs) | labs == ""] <- fallback[is.na(labs) | labs == ""]
+ labs <- labs[!is.na(labs) & labs != ""]
+ if (length(labs) == 0) {
+ return(NA_character_)
+ }
+ paste(unique(labs), collapse = ", ")
+ }
+
if (length(labels) == 2) {
fill_scores <- function(x, y) {
min_val <- suppressWarnings(min(c(x, y), na.rm = TRUE))
@@ -354,13 +398,22 @@
list(x = x, y = y)
}
- fill_ranks <- function(x, y) {
- max_val <- suppressWarnings(max(c(x, y), na.rm = TRUE))
- if (!is.finite(max_val)) {
- max_val <- 0
+ fill_ranks <- function(x, y, left_rank_col, right_rank_col) {
+ fallback <- nrow(comparison) + 1
+ replacement_left <- rank_replacements[[left_rank_col]]
+ if (is.null(replacement_left) || !is.finite(replacement_left)) {
+ replacement_left <- fallback
}
- x[is.na(x)] <- max_val + 1
- y[is.na(y)] <- max_val + 1
+ replacement_right <- rank_replacements[[right_rank_col]]
+ if (is.null(replacement_right) || !is.finite(replacement_right)) {
+ replacement_right <- fallback
+ }
+ if (any(is.na(x))) {
+ x[is.na(x)] <- replacement_left
+ }
+ if (any(is.na(y))) {
+ y[is.na(y)] <- replacement_right
+ }
list(x = x, y = y)
}
@@ -377,15 +430,17 @@
comparison[[left_col]] <- filled$x
comparison[[right_col]] <- filled$y
comparison[[paste0("delta_", col)]] <- filled$x - filled$y
- }
-
- left_rank <- paste0("rankWithinLabel_", left_label)
- right_rank <- paste0("rankWithinLabel_", right_label)
- if (all(c(left_rank, right_rank) %in% names(comparison))) {
- filled_rank <- fill_ranks(comparison[[left_rank]], comparison[[right_rank]])
- comparison[[left_rank]] <- filled_rank$x
- comparison[[right_rank]] <- filled_rank$y
- comparison[["delta_rankWithinLabel"]] <- filled_rank$x - filled_rank$y
+ rank_left <- paste0("rank_", left_label, "_", col)
+ rank_right <- paste0("rank_", right_label, "_", col)
+ if (all(c(rank_left, rank_right) %in% names(comparison))) {
+ filled_rank <- fill_ranks(
+ comparison[[rank_left]],
+ comparison[[rank_right]],
+ rank_left,
+ rank_right
+ )
+ comparison[[paste0("delta_rank_", col)]] <- filled_rank$x - filled_rank$y
+ }
}
}
@@ -431,20 +486,6 @@
loser_values <- rep(NA_real_, n_rows)
max_deltas <- rep(NA_real_, n_rows)
- collapse_labels <- function(indices) {
- if (length(indices) == 0) {
- return(NA_character_)
- }
- labs <- label_map[safe_labels[indices]]
- fallback <- safe_labels[indices]
- labs[is.na(labs) | labs == ""] <- fallback[is.na(labs) | labs == ""]
- labs <- labs[!is.na(labs) & labs != ""]
- if (length(labs) == 0) {
- return(NA_character_)
- }
- paste(unique(labs), collapse = ", ")
- }
-
if (n_rows > 0) {
for (i in seq_len(n_rows)) {
numeric_row <- as.numeric(score_matrix[i, ])
@@ -461,20 +502,20 @@
max_val <- suppressWarnings(max(numeric_row, na.rm = TRUE))
max_idx <- which(numeric_row == max_val)
- winner_labels[i] <- collapse_labels(max_idx)
+ winner_labels[i] <- collapse_label_values(max_idx, safe_labels)
winner_values[i] <- max_val
unique_vals <- sort(unique(numeric_row), decreasing = TRUE)
if (length(unique_vals) >= 2) {
runner_val <- unique_vals[2]
runner_idx <- which(numeric_row == runner_val)
- runner_labels[i] <- collapse_labels(runner_idx)
+ runner_labels[i] <- collapse_label_values(runner_idx, safe_labels)
runner_values[i] <- runner_val
}
min_val <- suppressWarnings(min(numeric_row, na.rm = TRUE))
min_idx <- which(numeric_row == min_val)
- loser_labels[i] <- collapse_labels(min_idx)
+ loser_labels[i] <- collapse_label_values(min_idx, safe_labels)
loser_values[i] <- min_val
if (is.finite(max_val) && is.finite(min_val)) {
@@ -493,6 +534,96 @@
comparison[[max_delta_col]] <- max_deltas
}
+ for (col in score_cols) {
+ rank_cols <- paste0("rank_", labels, "_", col)
+ existing <- rank_cols %in% names(comparison)
+ if (!any(existing)) {
+ next
+ }
+ rank_cols <- rank_cols[existing]
+ safe_labels <- labels[existing]
+ rank_values <- comparison[, rank_cols, drop = FALSE]
+
+ winner_rank_label_col <- paste0("winner_rank_", col)
+ winner_rank_value_col <- paste0("winner_rank_", col, "_value")
+ runner_rank_label_col <- paste0("runner_up_rank_", col)
+ runner_rank_value_col <- paste0("runner_up_rank_", col, "_value")
+ loser_rank_label_col <- paste0("loser_rank_", col)
+ loser_rank_value_col <- paste0("loser_rank_", col, "_value")
+ max_delta_rank_col <- paste0("max_delta_rank_", col)
+
+ if (nrow(rank_values) == 0) {
+ comparison[[winner_rank_label_col]] <- character(0)
+ comparison[[winner_rank_value_col]] <- numeric(0)
+ comparison[[runner_rank_label_col]] <- character(0)
+ comparison[[runner_rank_value_col]] <- numeric(0)
+ comparison[[loser_rank_label_col]] <- character(0)
+ comparison[[loser_rank_value_col]] <- numeric(0)
+ comparison[[max_delta_rank_col]] <- numeric(0)
+ next
+ }
+
+ rank_matrix <- as.matrix(rank_values)
+ storage.mode(rank_matrix) <- "numeric"
+
+ n_rows <- nrow(rank_matrix)
+ winner_labels <- rep(NA_character_, n_rows)
+ winner_values <- rep(NA_real_, n_rows)
+ runner_labels <- rep(NA_character_, n_rows)
+ runner_values <- rep(NA_real_, n_rows)
+ loser_labels <- rep(NA_character_, n_rows)
+ loser_values <- rep(NA_real_, n_rows)
+ max_deltas <- rep(NA_real_, n_rows)
+
+ for (i in seq_len(n_rows)) {
+ numeric_row <- as.numeric(rank_matrix[i, ])
+ if (all(is.na(numeric_row))) {
+ next
+ }
+
+ if (length(rank_cols) > 0) {
+ replacement_vec <- rank_replacements[rank_cols]
+ replacement_vec[is.na(replacement_vec)] <- nrow(comparison) + 1
+ missing_idx <- which(is.na(numeric_row))
+ if (length(missing_idx) > 0) {
+ numeric_row[missing_idx] <- replacement_vec[missing_idx]
+ }
+ }
+
+ valid_idx <- seq_along(numeric_row)
+ valid_values <- numeric_row[valid_idx]
+ min_val <- suppressWarnings(min(valid_values, na.rm = TRUE))
+ min_positions <- valid_idx[which(valid_values == min_val)]
+ winner_labels[i] <- collapse_label_values(min_positions, safe_labels)
+ winner_values[i] <- min_val
+
+ ordered_vals <- sort(unique(valid_values), decreasing = FALSE)
+ if (length(ordered_vals) >= 2) {
+ runner_val <- ordered_vals[2]
+ runner_positions <- valid_idx[which(valid_values == runner_val)]
+ runner_labels[i] <- collapse_label_values(runner_positions, safe_labels)
+ runner_values[i] <- runner_val
+ }
+
+ max_val <- suppressWarnings(max(valid_values, na.rm = TRUE))
+ max_positions <- valid_idx[which(valid_values == max_val)]
+ loser_labels[i] <- collapse_label_values(max_positions, safe_labels)
+ loser_values[i] <- max_val
+
+ if (is.finite(max_val) && is.finite(min_val)) {
+ max_deltas[i] <- max_val - min_val
+ }
+ }
+
+ comparison[[winner_rank_label_col]] <- winner_labels
+ comparison[[winner_rank_value_col]] <- winner_values
+ comparison[[runner_rank_label_col]] <- runner_labels
+ comparison[[runner_rank_value_col]] <- runner_values
+ comparison[[loser_rank_label_col]] <- loser_labels
+ comparison[[loser_rank_value_col]] <- loser_values
+ comparison[[max_delta_rank_col]] <- max_deltas
+ }
+
dplyr::left_join(result, comparison, by = c("node", "collocate"))
}
diff --git a/tests/testthat/test-collocations.R b/tests/testthat/test-collocations.R
index 914558b..52a2a66 100644
--- a/tests/testthat/test-collocations.R
+++ b/tests/testthat/test-collocations.R
@@ -127,7 +127,28 @@
"winner_logDice",
"winner_logDice_value",
"runner_up_logDice",
- "runner_up_logDice_value"
+ "runner_up_logDice_value",
+ "max_delta_logDice",
+ "winner_rank_logDice",
+ "winner_rank_logDice_value",
+ "runner_up_rank_logDice",
+ "runner_up_rank_logDice_value",
+ "loser_rank_logDice",
+ "loser_rank_logDice_value",
+ "max_delta_rank_logDice",
+ "winner_rank_pmi",
+ "winner_rank_pmi_value",
+ "runner_up_rank_pmi",
+ "runner_up_rank_pmi_value",
+ "loser_rank_pmi",
+ "loser_rank_pmi_value",
+ "max_delta_rank_pmi",
+ "rank_A_logDice",
+ "rank_B_logDice",
+ "rank_A_pmi",
+ "rank_B_pmi",
+ "delta_rank_logDice",
+ "delta_rank_pmi"
) %in% colnames(enriched)))
expect_true(all(enriched$winner_logDice == "B"))
@@ -164,6 +185,82 @@
expect_equal(enriched$max_delta_logDice[1], 4)
})
+test_that("add_multi_vc_comparisons computes rank deltas", {
+ base_tbl <- tidyr::expand_grid(
+ label = c("A", "B", "C"),
+ collocate = c("c1", "c2", "c3")
+ ) |>
+ dplyr::mutate(
+ node = "n",
+ vc = paste0("vc", label),
+ N = 100,
+ O = 10,
+ O1 = 50,
+ O2 = 40,
+ E = 5,
+ w = 2,
+ leftContextSize = 1,
+ rightContextSize = 1,
+ frequency = 10,
+ logDice = dplyr::case_when(
+ label == "A" & collocate == "c1" ~ 9,
+ label == "A" & collocate == "c2" ~ 6,
+ label == "A" & collocate == "c3" ~ 3,
+ label == "B" & collocate == "c1" ~ 7,
+ label == "B" & collocate == "c2" ~ 9,
+ label == "B" & collocate == "c3" ~ 5,
+ label == "C" & collocate == "c1" ~ 4,
+ label == "C" & collocate == "c2" ~ 6,
+ label == "C" & collocate == "c3" ~ 8,
+ TRUE ~ 0
+ )
+ )
+
+ enriched <- RKorAPClient:::add_multi_vc_comparisons(base_tbl, "logDice", 0.9)
+ target_row <- enriched |>
+ dplyr::filter(collocate == "c1") |>
+ dplyr::slice_head(n = 1)
+
+ expect_equal(target_row$rank_A_logDice, 1)
+ expect_equal(target_row$rank_B_logDice, 2)
+ expect_equal(target_row$rank_C_logDice, 3)
+ expect_equal(target_row$winner_rank_logDice, "A")
+ expect_equal(target_row$winner_rank_logDice_value, 1)
+ expect_equal(target_row$runner_up_rank_logDice, "B")
+ expect_equal(target_row$runner_up_rank_logDice_value, 2)
+ expect_equal(target_row$loser_rank_logDice, "C")
+ expect_equal(target_row$loser_rank_logDice_value, 3)
+ expect_equal(target_row$max_delta_rank_logDice, 2)
+})
+
+test_that("add_multi_vc_comparisons imputes missing ranks for max delta", {
+ sample_result <- tibble::tibble(
+ node = c("n", "n"),
+ collocate = c("c", "c"),
+ vc = c("vc1", "vc2"),
+ label = c("A", "B"),
+ N = c(100, 100),
+ O = c(10, 10),
+ O1 = c(50, 50),
+ O2 = c(30, 30),
+ E = c(5, 5),
+ w = c(2, 2),
+ leftContextSize = c(1, 1),
+ rightContextSize = c(1, 1),
+ frequency = c(10, 10),
+ logDice = c(5, NA)
+ )
+
+ enriched <- RKorAPClient:::add_multi_vc_comparisons(sample_result, "logDice", 0.9)
+
+ expect_equal(enriched$rank_A_logDice[1], 1)
+ expect_true(is.na(enriched$rank_B_logDice[1]))
+ expect_equal(enriched$winner_rank_logDice[1], "A")
+ expect_equal(enriched$loser_rank_logDice[1], "B")
+ expect_equal(enriched$loser_rank_logDice_value[1], 2)
+ expect_equal(enriched$max_delta_rank_logDice[1], 1)
+})
+
# New tests for improved coverage of collocationAnalysis.R helper functions
test_that("synsemanticStopwords returns German stopwords", {