| Marc Kupietz | dbd431a | 2021-08-29 12:17:45 +0200 | [diff] [blame] | 1 | test_that("collocationScoreQuery works", { |
| Marc Kupietz | 83d0af3 | 2022-02-24 12:49:28 +0100 | [diff] [blame] | 2 | skip_if_offline() |
| Marc Kupietz | 617266d | 2025-02-27 10:43:07 +0100 | [diff] [blame] | 3 | kco <- KorAPConnection(accessToken = NULL, cache = TRUE, verbose = TRUE) |
| Marc Kupietz | 7de5f32 | 2025-06-04 17:17:22 +0200 | [diff] [blame] | 4 | df <- collocationScoreQuery(kco, "Ameisenplage", "heimgesucht", leftContextSize = 0, rightContextSize = 1) |
| Marc Kupietz | dbd431a | 2021-08-29 12:17:45 +0200 | [diff] [blame] | 5 | expect_gt(df$logDice, 1) |
| 6 | expect_equal(df$ll, ll(df$O1, df$O2, df$O, df$N, df$E, df$w)) |
| 7 | expect_equal(df$pmi, pmi(df$O1, df$O2, df$O, df$N, df$E, df$w)) |
| 8 | expect_equal(df$mi2, mi2(df$O1, df$O2, df$O, df$N, df$E, df$w)) |
| 9 | expect_equal(df$mi3, mi3(df$O1, df$O2, df$O, df$N, df$E, df$w)) |
| 10 | expect_equal(df$logDice, logDice(df$O1, df$O2, df$O, df$N, df$E, df$w)) |
| 11 | }) |
| 12 | |
| Marc Kupietz | 581a29b | 2021-09-04 20:51:04 +0200 | [diff] [blame] | 13 | |
| 14 | test_that("collocationAnalysis works and warns about missing token", { |
| Marc Kupietz | 83d0af3 | 2022-02-24 12:49:28 +0100 | [diff] [blame] | 15 | skip_if_offline() |
| Marc Kupietz | 617266d | 2025-02-27 10:43:07 +0100 | [diff] [blame] | 16 | kco <- KorAPConnection( |
| Marc Kupietz | 7de5f32 | 2025-06-04 17:17:22 +0200 | [diff] [blame] | 17 | accessToken = NULL, |
| 18 | verbose = TRUE |
| 19 | ) |
| 20 | expect_warning( |
| 21 | df <- |
| 22 | collocationAnalysis( |
| 23 | kco, |
| 24 | "focus([tt/p=ADJA] {Newstickeritis})", |
| 25 | leftContextSize = 1, |
| 26 | rightContextSize = 0, |
| 27 | ), |
| 28 | "access token" |
| 29 | ) |
| Marc Kupietz | dbd431a | 2021-08-29 12:17:45 +0200 | [diff] [blame] | 30 | expect_gt(df$O, df$E) |
| Marc Kupietz | f912959 | 2025-01-26 19:17:54 +0100 | [diff] [blame] | 31 | expect_gt(df$logDice, -1) |
| Marc Kupietz | dbd431a | 2021-08-29 12:17:45 +0200 | [diff] [blame] | 32 | }) |
| 33 | |
| 34 | test_that("collocationAnalysis on unaccounted strings does not error out", { |
| Marc Kupietz | 83d0af3 | 2022-02-24 12:49:28 +0100 | [diff] [blame] | 35 | skip_if_offline() |
| Marc Kupietz | 617266d | 2025-02-27 10:43:07 +0100 | [diff] [blame] | 36 | kco <- KorAPConnection(accessToken = NULL, verbose = TRUE) |
| Marc Kupietz | 581a29b | 2021-09-04 20:51:04 +0200 | [diff] [blame] | 37 | expect_warning( |
| Marc Kupietz | 7de5f32 | 2025-06-04 17:17:22 +0200 | [diff] [blame] | 38 | df <- collocationAnalysis(kco, "XXXXXXXXAmeisenplage", vc = c("corpusSigle=/WDD17/", "corpusSigle=/WUD17/"), maxRecurse = 2), |
| Marc Kupietz | 581a29b | 2021-09-04 20:51:04 +0200 | [diff] [blame] | 39 | "access token" |
| 40 | ) |
| Marc Kupietz | dbd431a | 2021-08-29 12:17:45 +0200 | [diff] [blame] | 41 | testthat::expect_equal(nrow(df), 0) |
| 42 | }) |
| Marc Kupietz | d6314b6 | 2021-12-22 12:49:09 +0100 | [diff] [blame] | 43 | |
| Marc Kupietz | 7de5f32 | 2025-06-04 17:17:22 +0200 | [diff] [blame] | 44 | # test_that("removeWithinSpanWorks", { |
| Marc Kupietz | 76dee31 | 2025-04-06 16:24:47 +0200 | [diff] [blame] | 45 | # expect_equal( |
| 46 | # removeWithinSpan("contains(<base/s=s>, (machen []{0,1} aufmerksam | aufmerksam []{0,1} machen))", "base/s=s"), |
| 47 | # "(machen []{0,1} aufmerksam | aufmerksam []{0,1} machen)") |
| Marc Kupietz | 7de5f32 | 2025-06-04 17:17:22 +0200 | [diff] [blame] | 48 | # }) |
| Marc Kupietz | dbdbb1f | 2025-02-19 10:33:06 +0100 | [diff] [blame] | 49 | |
| 50 | |
| 51 | test_that("mergeDuplicateCollocatesWorksAsExpected", { |
| Marc Kupietz | 5057f50 | 2025-04-06 16:55:57 +0200 | [diff] [blame] | 52 | ldf <- tibble::tibble( |
| Marc Kupietz | dbdbb1f | 2025-02-19 10:33:06 +0100 | [diff] [blame] | 53 | node = c("focus(in [tt/p=NN] {[tt/l=nehmen]})"), |
| 54 | collocate = c("Anspruch"), |
| 55 | label = c(""), |
| 56 | vc = c(""), |
| 57 | query = c("Anspruch focus(in [tt/p=NN] {[tt/l=nehmen]})"), |
| 58 | webUIRequestUrl = c( |
| 59 | "https://korap.ids-mannheim.de/?q=Anspruch%20focus%28in%20%5btt%2fp%3dNN%5d%20%7b%5btt%2fl%3dnehmen%5d%7d%29&ql=poliqarp" |
| 60 | ), |
| 61 | w = c(1), |
| 62 | leftContextSize = c(1), |
| 63 | rightContextSize = c(0), |
| 64 | N = c(23578528381.5), |
| 65 | O = c(0.5), |
| 66 | O1 = c(1168410.5), |
| 67 | O2 = c(1296870.5), |
| 68 | E = c(64.2651265093014), |
| 69 | pmi = c(11.9173498777957), |
| 70 | mi2 = c(29.8406639214616), |
| 71 | mi3 = c(47.7639779651274), |
| 72 | logDice = c(11.6899933757298), |
| 73 | ll = c(3717716.74208791) |
| 74 | ) |
| Marc Kupietz | 5057f50 | 2025-04-06 16:55:57 +0200 | [diff] [blame] | 75 | rdf <- tibble::tibble( |
| Marc Kupietz | dbdbb1f | 2025-02-19 10:33:06 +0100 | [diff] [blame] | 76 | node = c("focus({[tt/l=nehmen] in} [tt/p=NN])"), |
| 77 | collocate = c("Anspruch"), |
| 78 | label = c(""), |
| 79 | vc = c(""), |
| 80 | query = c("focus({[tt/l=nehmen] in} [tt/p=NN]) Anspruch"), |
| 81 | webUIRequestUrl = c( |
| 82 | "https://korap.ids-mannheim.de/?q=focus%28%7b%5btt%2fl%3dnehmen%5d%20in%7d%20%5btt%2fp%3dNN%5d%29%20Anspruch&ql=poliqarp" |
| 83 | ), |
| 84 | w = c(1), |
| 85 | leftContextSize = c(0), |
| 86 | rightContextSize = c(1), |
| 87 | N = c(23578528381.5), |
| 88 | O = c(0.5), |
| 89 | O1 = c(17077.5), |
| 90 | O2 = c(1296870.5), |
| 91 | E = c(0.939299756346416), |
| 92 | pmi = c(7.99469408391783), |
| 93 | mi2 = c(15.8990457079122), |
| 94 | mi3 = c(23.8033973319065), |
| 95 | logDice = c(2.57887487309409), |
| 96 | ll = c(2181.35986032019) |
| 97 | ) |
| 98 | merged <- mergeDuplicateCollocates(ldf, rdf, smoothingConstant = 0.5) |
| 99 | expect_equal(merged$O, 0.5) |
| 100 | expect_equal(merged$O1, 1185487.5) |
| 101 | expect_equal(merged$O2, 1296870.5) |
| 102 | expect_equal(merged$query, "Anspruch focus(in [tt/p=NN] {[tt/l=nehmen]}) | focus({[tt/l=nehmen] in} [tt/p=NN]) Anspruch") |
| 103 | }) |
| Marc Kupietz | 7de5f32 | 2025-06-04 17:17:22 +0200 | [diff] [blame] | 104 | |
| Marc Kupietz | 5e35d7a | 2025-10-17 21:21:22 +0200 | [diff] [blame] | 105 | test_that("add_multi_vc_comparisons adds favorite columns", { |
| 106 | sample_result <- tibble::tibble( |
| 107 | node = c("n", "n"), |
| 108 | collocate = c("c", "c"), |
| 109 | vc = c("vc1", "vc2"), |
| 110 | label = c("A", "B"), |
| 111 | N = c(100, 100), |
| 112 | O = c(10, 20), |
| 113 | O1 = c(50, 50), |
| 114 | O2 = c(30, 30), |
| 115 | E = c(5, 5), |
| 116 | w = c(2, 2), |
| 117 | leftContextSize = c(1, 1), |
| 118 | rightContextSize = c(1, 1), |
| 119 | frequency = c(10, 20), |
| 120 | logDice = c(5, 7), |
| 121 | pmi = c(2, 3) |
| 122 | ) |
| 123 | |
| Marc Kupietz | 77852b2 | 2025-10-19 11:35:34 +0200 | [diff] [blame] | 124 | enriched <- RKorAPClient:::add_multi_vc_comparisons(sample_result) |
| Marc Kupietz | 5e35d7a | 2025-10-17 21:21:22 +0200 | [diff] [blame] | 125 | |
| 126 | expect_true(all(c( |
| 127 | "winner_logDice", |
| 128 | "winner_logDice_value", |
| 129 | "runner_up_logDice", |
| Marc Kupietz | 28a2984 | 2025-10-18 12:25:09 +0200 | [diff] [blame] | 130 | "runner_up_logDice_value", |
| 131 | "max_delta_logDice", |
| 132 | "winner_rank_logDice", |
| 133 | "winner_rank_logDice_value", |
| 134 | "runner_up_rank_logDice", |
| 135 | "runner_up_rank_logDice_value", |
| 136 | "loser_rank_logDice", |
| 137 | "loser_rank_logDice_value", |
| 138 | "max_delta_rank_logDice", |
| Marc Kupietz | 130a2a2 | 2025-10-18 16:09:23 +0200 | [diff] [blame] | 139 | "winner_percentile_rank_logDice", |
| 140 | "winner_percentile_rank_logDice_value", |
| 141 | "runner_up_percentile_rank_logDice", |
| 142 | "runner_up_percentile_rank_logDice_value", |
| 143 | "loser_percentile_rank_logDice", |
| 144 | "loser_percentile_rank_logDice_value", |
| 145 | "max_delta_percentile_rank_logDice", |
| Marc Kupietz | 28a2984 | 2025-10-18 12:25:09 +0200 | [diff] [blame] | 146 | "winner_rank_pmi", |
| 147 | "winner_rank_pmi_value", |
| 148 | "runner_up_rank_pmi", |
| 149 | "runner_up_rank_pmi_value", |
| 150 | "loser_rank_pmi", |
| 151 | "loser_rank_pmi_value", |
| 152 | "max_delta_rank_pmi", |
| Marc Kupietz | 130a2a2 | 2025-10-18 16:09:23 +0200 | [diff] [blame] | 153 | "winner_percentile_rank_pmi", |
| 154 | "winner_percentile_rank_pmi_value", |
| 155 | "runner_up_percentile_rank_pmi", |
| 156 | "runner_up_percentile_rank_pmi_value", |
| 157 | "loser_percentile_rank_pmi", |
| 158 | "loser_percentile_rank_pmi_value", |
| 159 | "max_delta_percentile_rank_pmi", |
| Marc Kupietz | 28a2984 | 2025-10-18 12:25:09 +0200 | [diff] [blame] | 160 | "rank_A_logDice", |
| 161 | "rank_B_logDice", |
| 162 | "rank_A_pmi", |
| 163 | "rank_B_pmi", |
| Marc Kupietz | 130a2a2 | 2025-10-18 16:09:23 +0200 | [diff] [blame] | 164 | "percentile_rank_A_logDice", |
| 165 | "percentile_rank_B_logDice", |
| 166 | "percentile_rank_A_pmi", |
| 167 | "percentile_rank_B_pmi", |
| Marc Kupietz | 28a2984 | 2025-10-18 12:25:09 +0200 | [diff] [blame] | 168 | "delta_rank_logDice", |
| Marc Kupietz | 130a2a2 | 2025-10-18 16:09:23 +0200 | [diff] [blame] | 169 | "delta_rank_pmi", |
| 170 | "delta_percentile_rank_logDice", |
| 171 | "delta_percentile_rank_pmi" |
| Marc Kupietz | 5e35d7a | 2025-10-17 21:21:22 +0200 | [diff] [blame] | 172 | ) %in% colnames(enriched))) |
| 173 | |
| 174 | expect_true(all(enriched$winner_logDice == "B")) |
| 175 | expect_true(all(enriched$runner_up_logDice == "A")) |
| 176 | expect_true(all(enriched$winner_logDice_value >= enriched$runner_up_logDice_value)) |
| Marc Kupietz | 130a2a2 | 2025-10-18 16:09:23 +0200 | [diff] [blame] | 177 | expect_true(all(enriched$percentile_rank_A_logDice == 1)) |
| 178 | expect_true(all(enriched$percentile_rank_B_logDice == 1)) |
| 179 | expect_true(all(enriched$delta_percentile_rank_logDice == 0)) |
| Marc Kupietz | 5e35d7a | 2025-10-17 21:21:22 +0200 | [diff] [blame] | 180 | }) |
| 181 | |
| Marc Kupietz | b2862d4 | 2025-10-18 10:17:49 +0200 | [diff] [blame] | 182 | test_that("add_multi_vc_comparisons handles more than two labels", { |
| 183 | sample_result <- tibble::tibble( |
| 184 | node = rep("n", 3), |
| 185 | collocate = rep("c", 3), |
| 186 | vc = c("vc1", "vc2", "vc3"), |
| 187 | label = c("A", "B", "C"), |
| 188 | N = rep(100, 3), |
| 189 | O = c(10, 30, 5), |
| 190 | O1 = rep(50, 3), |
| 191 | O2 = rep(30, 3), |
| 192 | E = rep(5, 3), |
| 193 | w = rep(2, 3), |
| 194 | leftContextSize = rep(1, 3), |
| 195 | rightContextSize = rep(1, 3), |
| 196 | frequency = c(10, 30, 5), |
| 197 | logDice = c(5, 8, 4), |
| 198 | pmi = c(2, 3, 1) |
| 199 | ) |
| 200 | |
| Marc Kupietz | 77852b2 | 2025-10-19 11:35:34 +0200 | [diff] [blame] | 201 | enriched <- RKorAPClient:::add_multi_vc_comparisons(sample_result) |
| Marc Kupietz | b2862d4 | 2025-10-18 10:17:49 +0200 | [diff] [blame] | 202 | expect_equal(enriched$winner_logDice[1], "B") |
| 203 | expect_equal(enriched$winner_logDice_value[1], 8) |
| 204 | expect_equal(enriched$runner_up_logDice[1], "A") |
| 205 | expect_equal(enriched$runner_up_logDice_value[1], 5) |
| 206 | expect_equal(enriched$loser_logDice[1], "C") |
| 207 | expect_equal(enriched$loser_logDice_value[1], 4) |
| 208 | expect_equal(enriched$max_delta_logDice[1], 4) |
| 209 | }) |
| 210 | |
| Marc Kupietz | 28a2984 | 2025-10-18 12:25:09 +0200 | [diff] [blame] | 211 | test_that("add_multi_vc_comparisons computes rank deltas", { |
| 212 | base_tbl <- tidyr::expand_grid( |
| 213 | label = c("A", "B", "C"), |
| 214 | collocate = c("c1", "c2", "c3") |
| 215 | ) |> |
| 216 | dplyr::mutate( |
| 217 | node = "n", |
| 218 | vc = paste0("vc", label), |
| 219 | N = 100, |
| 220 | O = 10, |
| 221 | O1 = 50, |
| 222 | O2 = 40, |
| 223 | E = 5, |
| 224 | w = 2, |
| 225 | leftContextSize = 1, |
| 226 | rightContextSize = 1, |
| 227 | frequency = 10, |
| 228 | logDice = dplyr::case_when( |
| 229 | label == "A" & collocate == "c1" ~ 9, |
| 230 | label == "A" & collocate == "c2" ~ 6, |
| 231 | label == "A" & collocate == "c3" ~ 3, |
| 232 | label == "B" & collocate == "c1" ~ 7, |
| 233 | label == "B" & collocate == "c2" ~ 9, |
| 234 | label == "B" & collocate == "c3" ~ 5, |
| 235 | label == "C" & collocate == "c1" ~ 4, |
| 236 | label == "C" & collocate == "c2" ~ 6, |
| 237 | label == "C" & collocate == "c3" ~ 8, |
| 238 | TRUE ~ 0 |
| 239 | ) |
| 240 | ) |
| 241 | |
| Marc Kupietz | 77852b2 | 2025-10-19 11:35:34 +0200 | [diff] [blame] | 242 | enriched <- RKorAPClient:::add_multi_vc_comparisons(base_tbl) |
| Marc Kupietz | 28a2984 | 2025-10-18 12:25:09 +0200 | [diff] [blame] | 243 | target_row <- enriched |> |
| 244 | dplyr::filter(collocate == "c1") |> |
| 245 | dplyr::slice_head(n = 1) |
| 246 | |
| 247 | expect_equal(target_row$rank_A_logDice, 1) |
| 248 | expect_equal(target_row$rank_B_logDice, 2) |
| 249 | expect_equal(target_row$rank_C_logDice, 3) |
| 250 | expect_equal(target_row$winner_rank_logDice, "A") |
| 251 | expect_equal(target_row$winner_rank_logDice_value, 1) |
| 252 | expect_equal(target_row$runner_up_rank_logDice, "B") |
| 253 | expect_equal(target_row$runner_up_rank_logDice_value, 2) |
| 254 | expect_equal(target_row$loser_rank_logDice, "C") |
| 255 | expect_equal(target_row$loser_rank_logDice_value, 3) |
| 256 | expect_equal(target_row$max_delta_rank_logDice, 2) |
| 257 | }) |
| 258 | |
| 259 | test_that("add_multi_vc_comparisons imputes missing ranks for max delta", { |
| 260 | sample_result <- tibble::tibble( |
| 261 | node = c("n", "n"), |
| 262 | collocate = c("c", "c"), |
| 263 | vc = c("vc1", "vc2"), |
| 264 | label = c("A", "B"), |
| 265 | N = c(100, 100), |
| 266 | O = c(10, 10), |
| 267 | O1 = c(50, 50), |
| 268 | O2 = c(30, 30), |
| 269 | E = c(5, 5), |
| 270 | w = c(2, 2), |
| 271 | leftContextSize = c(1, 1), |
| 272 | rightContextSize = c(1, 1), |
| 273 | frequency = c(10, 10), |
| 274 | logDice = c(5, NA) |
| 275 | ) |
| 276 | |
| Marc Kupietz | 77852b2 | 2025-10-19 11:35:34 +0200 | [diff] [blame] | 277 | enriched <- RKorAPClient:::add_multi_vc_comparisons(sample_result) |
| Marc Kupietz | 28a2984 | 2025-10-18 12:25:09 +0200 | [diff] [blame] | 278 | |
| 279 | expect_equal(enriched$rank_A_logDice[1], 1) |
| 280 | expect_true(is.na(enriched$rank_B_logDice[1])) |
| 281 | expect_equal(enriched$winner_rank_logDice[1], "A") |
| 282 | expect_equal(enriched$loser_rank_logDice[1], "B") |
| 283 | expect_equal(enriched$loser_rank_logDice_value[1], 2) |
| 284 | expect_equal(enriched$max_delta_rank_logDice[1], 1) |
| 285 | }) |
| 286 | |
| Marc Kupietz | 9894a37 | 2025-10-18 14:51:29 +0200 | [diff] [blame] | 287 | test_that("adaptive missing score imputation respects measure-specific scales", { |
| 288 | sample_result <- tibble::tibble( |
| 289 | node = c("n", "n", "n"), |
| 290 | collocate = c("c", "c", "c"), |
| 291 | vc = c("vc1", "vc2", "vc3"), |
| 292 | label = c("A", "B", "C"), |
| 293 | N = c(100, 100, 100), |
| 294 | O = c(12, 9, 7), |
| 295 | O1 = c(60, 40, 30), |
| 296 | O2 = c(33, 22, 18), |
| 297 | E = c(6, 6, 6), |
| 298 | w = c(2, 2, 2), |
| 299 | leftContextSize = c(1, 1, 1), |
| 300 | rightContextSize = c(1, 1, 1), |
| 301 | frequency = c(15, 11, 9), |
| 302 | logDice = c(-0.31, NA, -0.12), |
| 303 | pmi = c(-1.65, NA, -0.48), |
| 304 | ll = c(12.4, NA, 7.9) |
| 305 | ) |
| 306 | |
| 307 | enriched <- RKorAPClient:::add_multi_vc_comparisons( |
| 308 | sample_result, |
| Marc Kupietz | 9894a37 | 2025-10-18 14:51:29 +0200 | [diff] [blame] | 309 | missingScoreQuantile = 0.05 |
| 310 | ) |
| 311 | |
| 312 | row_a <- dplyr::filter(enriched, label == "A") |> dplyr::slice_head(n = 1) |
| 313 | |
| 314 | expect_false(is.na(row_a$logDice_B)) |
| 315 | expect_false(is.na(row_a$pmi_B)) |
| 316 | expect_false(is.na(row_a$ll_B)) |
| 317 | |
| 318 | expect_lt(row_a$logDice_B, min(sample_result$logDice, na.rm = TRUE)) |
| 319 | expect_lt(row_a$pmi_B, min(sample_result$pmi, na.rm = TRUE)) |
| 320 | expect_lte(row_a$ll_B, min(sample_result$ll, na.rm = TRUE)) |
| 321 | |
| 322 | expect_gt(row_a$max_delta_logDice, 0) |
| 323 | expect_gt(row_a$winner_logDice_value - row_a$loser_logDice_value, 0) |
| 324 | }) |
| 325 | |
| Marc Kupietz | 7de5f32 | 2025-06-04 17:17:22 +0200 | [diff] [blame] | 326 | # New tests for improved coverage of collocationAnalysis.R helper functions |
| 327 | |
| 328 | test_that("synsemanticStopwords returns German stopwords", { |
| 329 | stopwords <- synsemanticStopwords() |
| 330 | expect_true(is.character(stopwords)) |
| 331 | expect_true(length(stopwords) > 50) |
| 332 | expect_true("der" %in% stopwords) |
| 333 | expect_true("die" %in% stopwords) |
| 334 | expect_true("und" %in% stopwords) |
| 335 | expect_true("mit" %in% stopwords) |
| 336 | }) |
| 337 | |
| 338 | test_that("removeWithinSpan removes span constraints correctly", { |
| 339 | # Test basic span removal |
| 340 | query1 <- "contains(<base/s=s>, (machen []{0,1} aufmerksam | aufmerksam []{0,1} machen))" |
| 341 | result1 <- RKorAPClient:::removeWithinSpan(query1, "base/s=s") |
| 342 | expect_equal(result1, "(machen []{0,1} aufmerksam | aufmerksam []{0,1} machen)") |
| 343 | |
| 344 | # Test with different span |
| 345 | query2 <- "contains(<p/s=s>, (test query))" |
| 346 | result2 <- RKorAPClient:::removeWithinSpan(query2, "p/s=s") |
| 347 | expect_equal(result2, "(test query)") |
| 348 | |
| 349 | # Test with empty span - should return original query |
| 350 | query3 <- "simple query" |
| 351 | result3 <- RKorAPClient:::removeWithinSpan(query3, "") |
| 352 | expect_equal(result3, query3) |
| 353 | |
| 354 | # Test with non-matching span |
| 355 | query4 <- "contains(<base/s=s>, test)" |
| 356 | result4 <- RKorAPClient:::removeWithinSpan(query4, "other/span") |
| 357 | expect_equal(result4, query4) |
| 358 | }) |
| 359 | |
| 360 | test_that("matches2FreqTable handles empty matches", { |
| 361 | empty_matches <- data.frame() |
| 362 | result <- RKorAPClient:::matches2FreqTable(empty_matches, index = 0) |
| 363 | |
| 364 | expect_true(is.data.frame(result)) |
| 365 | expect_equal(nrow(result), 0) |
| 366 | }) |
| 367 | |
| 368 | test_that("matches2FreqTable processes single match correctly", { |
| 369 | # Create mock matches data |
| 370 | mock_matches <- data.frame( |
| 371 | tokens = I(list(list( |
| 372 | left = c("der", "große"), |
| 373 | match = "Test", |
| 374 | right = c("ist", "wichtig") |
| 375 | ))), |
| 376 | stringsAsFactors = FALSE |
| 377 | ) |
| 378 | |
| 379 | result <- RKorAPClient:::matches2FreqTable( |
| 380 | mock_matches, |
| 381 | index = 1, |
| 382 | leftContextSize = 2, |
| 383 | rightContextSize = 2, |
| 384 | stopwords = c("der", "ist") # Provide stopwords to avoid empty join |
| 385 | ) |
| 386 | |
| 387 | expect_true(is.data.frame(result)) |
| 388 | }) |
| 389 | |
| 390 | test_that("snippet2FreqTable handles empty snippet", { |
| 391 | result <- RKorAPClient:::snippet2FreqTable(character(0)) |
| 392 | |
| 393 | expect_true(is.data.frame(result)) |
| 394 | expect_equal(nrow(result), 0) |
| 395 | }) |
| 396 | |
| 397 | test_that("snippet2FreqTable processes single snippet correctly", { |
| 398 | snippet <- '<span class="context-left">der große </span><span class="match"><mark>Test</mark></span><span class="context-right"> ist wichtig</span>' |
| 399 | |
| 400 | result <- RKorAPClient:::snippet2FreqTable( |
| 401 | snippet, |
| 402 | leftContextSize = 2, |
| 403 | rightContextSize = 2, |
| 404 | stopwords = c("der"), # Provide stopwords to avoid empty join |
| 405 | verbose = FALSE |
| 406 | ) |
| 407 | |
| 408 | expect_true(is.data.frame(result)) |
| 409 | }) |
| 410 | |
| Marc Kupietz | 0a29263 | 2025-10-19 14:04:36 +0200 | [diff] [blame^] | 411 | test_that("inject_focus_into_query adds focus wrappers when span queries lack them", { |
| 412 | unfocused <- "contains(<base/s=s>, (Anspruch [tt/l=nehmen] | [tt/l=nehmen] Anspruch))" |
| 413 | focused <- RKorAPClient:::inject_focus_into_query(unfocused) |
| 414 | |
| 415 | expect_equal( |
| 416 | focused, |
| 417 | "contains(<base/s=s>, (focus({Anspruch [tt/l=nehmen]}) | focus({[tt/l=nehmen] Anspruch})))" |
| 418 | ) |
| 419 | }) |
| 420 | |
| 421 | test_that("inject_focus_into_query leaves existing focus segments unchanged", { |
| 422 | already_focused <- "contains(<base/s=s>, (focus({Anspruch [tt/l=nehmen]})))" |
| 423 | expect_identical( |
| 424 | RKorAPClient:::inject_focus_into_query(already_focused), |
| 425 | already_focused |
| 426 | ) |
| 427 | }) |
| 428 | |
| Marc Kupietz | 7de5f32 | 2025-06-04 17:17:22 +0200 | [diff] [blame] | 429 | # Removed hanging findExample tests as they cause infinite wait |
| 430 | # These tests make API calls that don't complete properly |
| 431 | |
| 432 | # Removed hanging collocatesQuery tests as they cause infinite wait |
| 433 | # These tests were causing the test suite to hang and not terminate |
| 434 | |
| 435 | test_that("collocationAnalysis handles exactFrequencies parameter", { |
| 436 | skip_if_offline() |
| 437 | kco <- KorAPConnection(accessToken = NULL, cache = TRUE, verbose = FALSE) |
| 438 | |
| 439 | expect_warning( |
| 440 | result <- collocationAnalysis( |
| 441 | kco, |
| 442 | "Test", |
| 443 | exactFrequencies = TRUE, |
| 444 | searchHitsSampleLimit = 5, |
| 445 | topCollocatesLimit = 5 |
| 446 | ), |
| 447 | "access token" |
| 448 | ) |
| 449 | expect_true(is.data.frame(result)) |
| 450 | }) |
| 451 | |
| 452 | test_that("collocationAnalysis handles withinSpan parameter", { |
| 453 | skip_if_offline() |
| 454 | kco <- KorAPConnection(accessToken = NULL, cache = TRUE, verbose = FALSE) |
| 455 | |
| 456 | expect_warning( |
| 457 | result <- collocationAnalysis( |
| 458 | kco, |
| 459 | "Test", |
| 460 | withinSpan = "base/s=s", |
| 461 | exactFrequencies = TRUE, |
| 462 | searchHitsSampleLimit = 5, |
| 463 | topCollocatesLimit = 5 |
| 464 | ), |
| 465 | "access token" |
| 466 | ) |
| 467 | expect_true(is.data.frame(result)) |
| 468 | }) |
| 469 | |
| 470 | test_that("collocationAnalysis handles expand parameter", { |
| 471 | skip_if_offline() |
| 472 | kco <- KorAPConnection(accessToken = NULL, cache = TRUE, verbose = FALSE) |
| 473 | |
| 474 | expect_warning( |
| 475 | result <- collocationAnalysis( |
| 476 | kco, |
| 477 | c("Test", "der"), |
| 478 | expand = TRUE, |
| 479 | searchHitsSampleLimit = 2, |
| 480 | topCollocatesLimit = 2 |
| 481 | ), |
| 482 | "access token" |
| 483 | ) |
| 484 | expect_true(is.data.frame(result)) |
| 485 | }) |
| 486 | |
| Marc Kupietz | e34a8be | 2025-10-17 20:13:42 +0200 | [diff] [blame] | 487 | test_that("collocationAnalysis honors named vc labels", { |
| 488 | skip_if_offline() |
| 489 | kco <- KorAPConnection(accessToken = NULL, cache = TRUE, verbose = FALSE) |
| 490 | |
| 491 | named_vc <- c( |
| 492 | Western = "textType=/.*Western.*/ & pubDate in 2012", |
| 493 | Erotic = "textType=/.*(Erotik|Gay).*/ & pubDate in 2012", |
| 494 | Historic = "textType=/.*Historisch.*/ & pubDate in 2012" |
| 495 | ) |
| 496 | |
| 497 | expect_warning( |
| 498 | result <- collocationAnalysis( |
| 499 | kco, |
| 500 | "[tt/l=treffen]", |
| 501 | vc = named_vc, |
| 502 | searchHitsSampleLimit = 2, |
| 503 | topCollocatesLimit = 2 |
| 504 | ), |
| 505 | "access token" |
| 506 | ) |
| 507 | |
| 508 | if (nrow(result) > 0) { |
| 509 | expect_true("label" %in% colnames(result)) |
| 510 | expect_setequal(unique(result$label), names(named_vc)) |
| 511 | } |
| 512 | }) |
| 513 | |
| Marc Kupietz | 7de5f32 | 2025-06-04 17:17:22 +0200 | [diff] [blame] | 514 | test_that("collocationAnalysis handles stopwords parameter", { |
| 515 | skip_if_offline() |
| 516 | kco <- KorAPConnection(accessToken = NULL, cache = TRUE, verbose = FALSE) |
| 517 | |
| 518 | expect_warning( |
| 519 | result <- collocationAnalysis( |
| 520 | kco, |
| 521 | "Test", |
| 522 | stopwords = c("der", "die", "und"), |
| 523 | searchHitsSampleLimit = 5, |
| 524 | topCollocatesLimit = 5 |
| 525 | ), |
| 526 | "access token" |
| 527 | ) |
| 528 | expect_true(is.data.frame(result)) |
| 529 | }) |
| 530 | |
| 531 | test_that("collocationAnalysis handles lemmatizeNodeQuery parameter", { |
| 532 | skip_if_offline() |
| 533 | kco <- KorAPConnection(accessToken = NULL, cache = TRUE, verbose = FALSE) |
| 534 | |
| 535 | expect_warning( |
| 536 | result <- collocationAnalysis( |
| 537 | kco, |
| 538 | "laufen", |
| 539 | lemmatizeNodeQuery = TRUE, |
| 540 | searchHitsSampleLimit = 5, |
| 541 | topCollocatesLimit = 5 |
| 542 | ), |
| 543 | "access token" |
| 544 | ) |
| 545 | expect_true(is.data.frame(result)) |
| 546 | }) |
| 547 | |
| 548 | test_that("collocationAnalysis handles addExamples parameter", { |
| 549 | skip_if_offline() |
| 550 | kco <- KorAPConnection(accessToken = NULL, cache = TRUE, verbose = FALSE) |
| 551 | |
| 552 | expect_warning( |
| 553 | result <- collocationAnalysis( |
| 554 | kco, |
| 555 | "Test", |
| 556 | addExamples = TRUE, |
| 557 | searchHitsSampleLimit = 3, |
| 558 | topCollocatesLimit = 3 |
| 559 | ), |
| 560 | "access token" |
| 561 | ) |
| 562 | expect_true(is.data.frame(result)) |
| 563 | if (nrow(result) > 0) { |
| 564 | expect_true("example" %in% colnames(result)) |
| 565 | } |
| 566 | }) |
| 567 | |
| 568 | test_that("collocationAnalysis handles maxRecurse parameter", { |
| 569 | skip_if_offline() |
| 570 | kco <- KorAPConnection(accessToken = NULL, cache = TRUE, verbose = FALSE) |
| 571 | |
| 572 | expect_warning( |
| 573 | result <- collocationAnalysis( |
| 574 | kco, |
| 575 | "Test", |
| 576 | maxRecurse = 1, |
| 577 | searchHitsSampleLimit = 2, |
| 578 | topCollocatesLimit = 2 |
| 579 | ), |
| 580 | "access token" |
| 581 | ) |
| 582 | expect_true(is.data.frame(result)) |
| 583 | }) |