CA: add winner columns
Change-Id: Ia6ec4821d08352f2044e8a7101880d75419d995d
diff --git a/R/collocationAnalysis.R b/R/collocationAnalysis.R
index 4f58396..56d7923 100644
--- a/R/collocationAnalysis.R
+++ b/R/collocationAnalysis.R
@@ -306,6 +306,8 @@
non_score_cols <- c("N", "O", "O1", "O2", "E", "w", "leftContextSize", "rightContextSize", "frequency")
score_cols <- setdiff(numeric_cols, non_score_cols)
+ score_cols <- setdiff(score_cols, "rankWithinLabel")
+
if (length(score_cols) == 0) {
return(result)
}
@@ -331,7 +333,9 @@
values_fn = dplyr::first
)
- labels <- make.names(unique(result$label))
+ raw_labels <- unique(result$label)
+ labels <- make.names(raw_labels)
+ label_map <- stats::setNames(raw_labels, labels)
if (length(labels) == 2) {
fill_scores <- function(x, y) {
@@ -364,6 +368,8 @@
next
}
filled <- fill_scores(comparison[[left_col]], comparison[[right_col]])
+ comparison[[left_col]] <- filled$x
+ comparison[[right_col]] <- filled$y
comparison[[paste0("delta_", col)]] <- filled$x - filled$y
}
@@ -371,10 +377,86 @@
right_rank <- paste0("rankWithinLabel_", right_label)
if (all(c(left_rank, right_rank) %in% names(comparison))) {
filled_rank <- fill_ranks(comparison[[left_rank]], comparison[[right_rank]])
+ comparison[[left_rank]] <- filled_rank$x
+ comparison[[right_rank]] <- filled_rank$y
comparison[["delta_rankWithinLabel"]] <- filled_rank$x - filled_rank$y
}
}
+ for (col in score_cols) {
+ value_cols <- paste0(col, "_", labels)
+ existing <- value_cols %in% names(comparison)
+ if (!any(existing)) {
+ next
+ }
+ value_cols <- value_cols[existing]
+ safe_labels <- labels[existing]
+
+ score_values <- comparison[, value_cols, drop = FALSE]
+
+ winner_label_col <- paste0("winner_", col)
+ winner_value_col <- paste0("winner_", col, "_value")
+ runner_label_col <- paste0("runner_up_", col)
+ runner_value_col <- paste0("runner_up_", col, "_value")
+
+ if (nrow(score_values) == 0) {
+ comparison[[winner_label_col]] <- character(0)
+ comparison[[winner_value_col]] <- numeric(0)
+ comparison[[runner_label_col]] <- character(0)
+ comparison[[runner_value_col]] <- numeric(0)
+ next
+ }
+
+ score_matrix <- as.matrix(score_values)
+
+ winner_labels <- apply(score_matrix, 1, function(row) {
+ row <- as.numeric(row)
+ valid <- which(!is.na(row))
+ if (length(valid) == 0) {
+ return(NA_character_)
+ }
+ ord <- valid[order(row[valid], decreasing = TRUE)]
+ unname(label_map[safe_labels[ord[1]]])
+ })
+ winner_labels <- unname(as.character(winner_labels))
+
+ winner_values <- apply(score_matrix, 1, function(row) {
+ row <- as.numeric(row)
+ if (all(is.na(row))) {
+ return(NA_real_)
+ }
+ max(row, na.rm = TRUE)
+ })
+ winner_values <- unname(as.numeric(winner_values))
+
+ runner_labels <- apply(score_matrix, 1, function(row) {
+ row <- as.numeric(row)
+ valid <- which(!is.na(row))
+ if (length(valid) < 2) {
+ return(NA_character_)
+ }
+ ord <- valid[order(row[valid], decreasing = TRUE)]
+ unname(label_map[safe_labels[ord[2]]])
+ })
+ runner_labels <- unname(as.character(runner_labels))
+
+ runner_values <- apply(score_matrix, 1, function(row) {
+ row <- as.numeric(row)
+ valid <- which(!is.na(row))
+ if (length(valid) < 2) {
+ return(NA_real_)
+ }
+ ord <- valid[order(row[valid], decreasing = TRUE)]
+ row[ord[2]]
+ })
+ runner_values <- unname(as.numeric(runner_values))
+
+ comparison[[winner_label_col]] <- winner_labels
+ comparison[[winner_value_col]] <- winner_values
+ comparison[[runner_label_col]] <- runner_labels
+ comparison[[runner_value_col]] <- runner_values
+ }
+
dplyr::left_join(result, comparison, by = c("node", "collocate"))
}
diff --git a/tests/testthat/test-collocations.R b/tests/testthat/test-collocations.R
index 448e061..bd86cdf 100644
--- a/tests/testthat/test-collocations.R
+++ b/tests/testthat/test-collocations.R
@@ -102,6 +102,39 @@
expect_equal(merged$query, "Anspruch focus(in [tt/p=NN] {[tt/l=nehmen]}) | focus({[tt/l=nehmen] in} [tt/p=NN]) Anspruch")
})
+test_that("add_multi_vc_comparisons adds favorite columns", {
+ sample_result <- tibble::tibble(
+ node = c("n", "n"),
+ collocate = c("c", "c"),
+ vc = c("vc1", "vc2"),
+ label = c("A", "B"),
+ N = c(100, 100),
+ O = c(10, 20),
+ O1 = c(50, 50),
+ O2 = c(30, 30),
+ E = c(5, 5),
+ w = c(2, 2),
+ leftContextSize = c(1, 1),
+ rightContextSize = c(1, 1),
+ frequency = c(10, 20),
+ logDice = c(5, 7),
+ pmi = c(2, 3)
+ )
+
+ enriched <- RKorAPClient:::add_multi_vc_comparisons(sample_result, "logDice", 0.9)
+
+ expect_true(all(c(
+ "winner_logDice",
+ "winner_logDice_value",
+ "runner_up_logDice",
+ "runner_up_logDice_value"
+ ) %in% colnames(enriched)))
+
+ expect_true(all(enriched$winner_logDice == "B"))
+ expect_true(all(enriched$runner_up_logDice == "A"))
+ expect_true(all(enriched$winner_logDice_value >= enriched$runner_up_logDice_value))
+})
+
# New tests for improved coverage of collocationAnalysis.R helper functions
test_that("synsemanticStopwords returns German stopwords", {