tests/testthat/test-collocations.R - KorAP/RKorAPClient - Gitiles

 test_that("collocationScoreQuery works", {
   skip_if_offline()
   kco <- KorAPConnection(accessToken = NULL, cache = TRUE, verbose = TRUE)
   df <- collocationScoreQuery(kco, "Ameisenplage", "heimgesucht", leftContextSize = 0, rightContextSize = 1)
   expect_gt(df$logDice, 1)
   expect_equal(df$ll, ll(df$O1, df$O2, df$O, df$N, df$E, df$w))
   expect_equal(df$pmi, pmi(df$O1, df$O2, df$O, df$N, df$E, df$w))
   expect_equal(df$mi2, mi2(df$O1, df$O2, df$O, df$N, df$E, df$w))
   expect_equal(df$mi3, mi3(df$O1, df$O2, df$O, df$N, df$E, df$w))
   expect_equal(df$logDice, logDice(df$O1, df$O2, df$O, df$N, df$E, df$w))
 })


 test_that("collocationAnalysis works and warns about missing token", {
   skip_if_offline()
   kco <- KorAPConnection(
     accessToken = NULL,
     verbose = TRUE
   )
   expect_warning(
     df <-
       collocationAnalysis(
         kco,
         "focus([tt/p=ADJA] {Newstickeritis})",
         leftContextSize = 1,
         rightContextSize = 0,
       ),
     "access token"
   )
   expect_gt(df$O, df$E)
   expect_gt(df$logDice, -1)
 })

 test_that("collocationAnalysis on unaccounted strings does not error out", {
   skip_if_offline()
   kco <- KorAPConnection(accessToken = NULL, verbose = TRUE)
   expect_warning(
     df <- collocationAnalysis(kco, "XXXXXXXXAmeisenplage", vc = c("corpusSigle=/WDD17/", "corpusSigle=/WUD17/"), maxRecurse = 2),
     "access token"
   )
   testthat::expect_equal(nrow(df), 0)
 })

 # test_that("removeWithinSpanWorks", {
 #  expect_equal(
 #    removeWithinSpan("contains(<base/s=s>, (machen []{0,1} aufmerksam | aufmerksam []{0,1} machen))", "base/s=s"),
 #    "(machen []{0,1} aufmerksam | aufmerksam []{0,1} machen)")
 # })


 test_that("mergeDuplicateCollocatesWorksAsExpected", {
   ldf <- tibble::tibble(
     node = c("focus(in [tt/p=NN] {[tt/l=nehmen]})"),
     collocate = c("Anspruch"),
     label = c(""),
     vc = c(""),
     query = c("Anspruch focus(in [tt/p=NN] {[tt/l=nehmen]})"),
     webUIRequestUrl = c(
       "https://korap.ids-mannheim.de/?q=Anspruch%20focus%28in%20%5btt%2fp%3dNN%5d%20%7b%5btt%2fl%3dnehmen%5d%7d%29&ql=poliqarp"
     ),
     w = c(1),
     leftContextSize = c(1),
     rightContextSize = c(0),
     N = c(23578528381.5),
     O = c(0.5),
     O1 = c(1168410.5),
     O2 = c(1296870.5),
     E = c(64.2651265093014),
     pmi = c(11.9173498777957),
     mi2 = c(29.8406639214616),
     mi3 = c(47.7639779651274),
     logDice = c(11.6899933757298),
     ll = c(3717716.74208791)
   )
   rdf <- tibble::tibble(
     node = c("focus({[tt/l=nehmen] in} [tt/p=NN])"),
     collocate = c("Anspruch"),
     label = c(""),
     vc = c(""),
     query = c("focus({[tt/l=nehmen] in} [tt/p=NN]) Anspruch"),
     webUIRequestUrl = c(
       "https://korap.ids-mannheim.de/?q=focus%28%7b%5btt%2fl%3dnehmen%5d%20in%7d%20%5btt%2fp%3dNN%5d%29%20Anspruch&ql=poliqarp"
     ),
     w = c(1),
     leftContextSize = c(0),
     rightContextSize = c(1),
     N = c(23578528381.5),
     O = c(0.5),
     O1 = c(17077.5),
     O2 = c(1296870.5),
     E = c(0.939299756346416),
     pmi = c(7.99469408391783),
     mi2 = c(15.8990457079122),
     mi3 = c(23.8033973319065),
     logDice = c(2.57887487309409),
     ll = c(2181.35986032019)
   )
   merged <- mergeDuplicateCollocates(ldf, rdf, smoothingConstant = 0.5)
   expect_equal(merged$O, 0.5)
   expect_equal(merged$O1, 1185487.5)
   expect_equal(merged$O2, 1296870.5)
   expect_equal(merged$query, "Anspruch focus(in [tt/p=NN] {[tt/l=nehmen]}) | focus({[tt/l=nehmen] in} [tt/p=NN]) Anspruch")
 })

 test_that("add_multi_vc_comparisons adds favorite columns", {
   sample_result <- tibble::tibble(
     node = c("n", "n"),
     collocate = c("c", "c"),
     vc = c("vc1", "vc2"),
     label = c("A", "B"),
     N = c(100, 100),
     O = c(10, 20),
     O1 = c(50, 50),
     O2 = c(30, 30),
     E = c(5, 5),
     w = c(2, 2),
     leftContextSize = c(1, 1),
     rightContextSize = c(1, 1),
     frequency = c(10, 20),
     logDice = c(5, 7),
     pmi = c(2, 3)
   )

   enriched <- RKorAPClient:::add_multi_vc_comparisons(sample_result)

   expect_true(all(c(
     "winner_logDice",
     "winner_logDice_value",
     "runner_up_logDice",
     "runner_up_logDice_value",
     "max_delta_logDice",
     "winner_rank_logDice",
     "winner_rank_logDice_value",
     "runner_up_rank_logDice",
     "runner_up_rank_logDice_value",
     "loser_rank_logDice",
     "loser_rank_logDice_value",
     "max_delta_rank_logDice",
     "winner_percentile_rank_logDice",
     "winner_percentile_rank_logDice_value",
     "runner_up_percentile_rank_logDice",
     "runner_up_percentile_rank_logDice_value",
     "loser_percentile_rank_logDice",
     "loser_percentile_rank_logDice_value",
     "max_delta_percentile_rank_logDice",
     "winner_rank_pmi",
     "winner_rank_pmi_value",
     "runner_up_rank_pmi",
     "runner_up_rank_pmi_value",
     "loser_rank_pmi",
     "loser_rank_pmi_value",
     "max_delta_rank_pmi",
     "winner_percentile_rank_pmi",
     "winner_percentile_rank_pmi_value",
     "runner_up_percentile_rank_pmi",
     "runner_up_percentile_rank_pmi_value",
     "loser_percentile_rank_pmi",
     "loser_percentile_rank_pmi_value",
     "max_delta_percentile_rank_pmi",
     "rank_A_logDice",
     "rank_B_logDice",
     "rank_A_pmi",
     "rank_B_pmi",
     "percentile_rank_A_logDice",
     "percentile_rank_B_logDice",
     "percentile_rank_A_pmi",
     "percentile_rank_B_pmi",
     "delta_rank_logDice",
     "delta_rank_pmi",
     "delta_percentile_rank_logDice",
     "delta_percentile_rank_pmi"
   ) %in% colnames(enriched)))

   expect_true(all(enriched$winner_logDice == "B"))
   expect_true(all(enriched$runner_up_logDice == "A"))
   expect_true(all(enriched$winner_logDice_value >= enriched$runner_up_logDice_value))
   expect_true(all(enriched$percentile_rank_A_logDice == 1))
   expect_true(all(enriched$percentile_rank_B_logDice == 1))
   expect_true(all(enriched$delta_percentile_rank_logDice == 0))
 })

 test_that("add_multi_vc_comparisons handles more than two labels", {
   sample_result <- tibble::tibble(
     node = rep("n", 3),
     collocate = rep("c", 3),
     vc = c("vc1", "vc2", "vc3"),
     label = c("A", "B", "C"),
     N = rep(100, 3),
     O = c(10, 30, 5),
     O1 = rep(50, 3),
     O2 = rep(30, 3),
     E = rep(5, 3),
     w = rep(2, 3),
     leftContextSize = rep(1, 3),
     rightContextSize = rep(1, 3),
     frequency = c(10, 30, 5),
     logDice = c(5, 8, 4),
     pmi = c(2, 3, 1)
   )

   enriched <- RKorAPClient:::add_multi_vc_comparisons(sample_result)
   expect_equal(enriched$winner_logDice[1], "B")
   expect_equal(enriched$winner_logDice_value[1], 8)
   expect_equal(enriched$runner_up_logDice[1], "A")
   expect_equal(enriched$runner_up_logDice_value[1], 5)
   expect_equal(enriched$loser_logDice[1], "C")
   expect_equal(enriched$loser_logDice_value[1], 4)
   expect_equal(enriched$max_delta_logDice[1], 4)
 })

 test_that("add_multi_vc_comparisons computes rank deltas", {
   base_tbl <- tidyr::expand_grid(
     label = c("A", "B", "C"),
     collocate = c("c1", "c2", "c3")
   ) |>
     dplyr::mutate(
       node = "n",
       vc = paste0("vc", label),
       N = 100,
       O = 10,
       O1 = 50,
       O2 = 40,
       E = 5,
       w = 2,
       leftContextSize = 1,
       rightContextSize = 1,
       frequency = 10,
       logDice = dplyr::case_when(
         label == "A" & collocate == "c1" ~ 9,
         label == "A" & collocate == "c2" ~ 6,
         label == "A" & collocate == "c3" ~ 3,
         label == "B" & collocate == "c1" ~ 7,
         label == "B" & collocate == "c2" ~ 9,
         label == "B" & collocate == "c3" ~ 5,
         label == "C" & collocate == "c1" ~ 4,
         label == "C" & collocate == "c2" ~ 6,
         label == "C" & collocate == "c3" ~ 8,
         TRUE ~ 0
       )
     )

   enriched <- RKorAPClient:::add_multi_vc_comparisons(base_tbl)
   target_row <- enriched |>
     dplyr::filter(collocate == "c1") |>
     dplyr::slice_head(n = 1)

   expect_equal(target_row$rank_A_logDice, 1)
   expect_equal(target_row$rank_B_logDice, 2)
   expect_equal(target_row$rank_C_logDice, 3)
   expect_equal(target_row$winner_rank_logDice, "A")
   expect_equal(target_row$winner_rank_logDice_value, 1)
   expect_equal(target_row$runner_up_rank_logDice, "B")
   expect_equal(target_row$runner_up_rank_logDice_value, 2)
   expect_equal(target_row$loser_rank_logDice, "C")
   expect_equal(target_row$loser_rank_logDice_value, 3)
   expect_equal(target_row$max_delta_rank_logDice, 2)
 })

 test_that("add_multi_vc_comparisons imputes missing ranks for max delta", {
   sample_result <- tibble::tibble(
     node = c("n", "n"),
     collocate = c("c", "c"),
     vc = c("vc1", "vc2"),
     label = c("A", "B"),
     N = c(100, 100),
     O = c(10, 10),
     O1 = c(50, 50),
     O2 = c(30, 30),
     E = c(5, 5),
     w = c(2, 2),
     leftContextSize = c(1, 1),
     rightContextSize = c(1, 1),
     frequency = c(10, 10),
     logDice = c(5, NA)
   )

   enriched <- RKorAPClient:::add_multi_vc_comparisons(sample_result)

   expect_equal(enriched$rank_A_logDice[1], 1)
   expect_true(is.na(enriched$rank_B_logDice[1]))
   expect_equal(enriched$winner_rank_logDice[1], "A")
   expect_equal(enriched$loser_rank_logDice[1], "B")
   expect_equal(enriched$loser_rank_logDice_value[1], 2)
   expect_equal(enriched$max_delta_rank_logDice[1], 1)
 })

 test_that("adaptive missing score imputation respects measure-specific scales", {
   sample_result <- tibble::tibble(
     node = c("n", "n", "n"),
     collocate = c("c", "c", "c"),
     vc = c("vc1", "vc2", "vc3"),
     label = c("A", "B", "C"),
     N = c(100, 100, 100),
     O = c(12, 9, 7),
     O1 = c(60, 40, 30),
     O2 = c(33, 22, 18),
     E = c(6, 6, 6),
     w = c(2, 2, 2),
     leftContextSize = c(1, 1, 1),
     rightContextSize = c(1, 1, 1),
     frequency = c(15, 11, 9),
     logDice = c(-0.31, NA, -0.12),
     pmi = c(-1.65, NA, -0.48),
     ll = c(12.4, NA, 7.9)
   )

   enriched <- RKorAPClient:::add_multi_vc_comparisons(
     sample_result,
     missingScoreQuantile = 0.05
   )

   row_a <- dplyr::filter(enriched, label == "A") |> dplyr::slice_head(n = 1)

   expect_false(is.na(row_a$logDice_B))
   expect_false(is.na(row_a$pmi_B))
   expect_false(is.na(row_a$ll_B))

   expect_lt(row_a$logDice_B, min(sample_result$logDice, na.rm = TRUE))
   expect_lt(row_a$pmi_B, min(sample_result$pmi, na.rm = TRUE))
   expect_lte(row_a$ll_B, min(sample_result$ll, na.rm = TRUE))

   expect_gt(row_a$max_delta_logDice, 0)
   expect_gt(row_a$winner_logDice_value - row_a$loser_logDice_value, 0)
 })

 # New tests for improved coverage of collocationAnalysis.R helper functions

 test_that("synsemanticStopwords returns German stopwords", {
   stopwords <- synsemanticStopwords()
   expect_true(is.character(stopwords))
   expect_true(length(stopwords) > 50)
   expect_true("der" %in% stopwords)
   expect_true("die" %in% stopwords)
   expect_true("und" %in% stopwords)
   expect_true("mit" %in% stopwords)
 })

 test_that("removeWithinSpan removes span constraints correctly", {
   # Test basic span removal
   query1 <- "contains(<base/s=s>, (machen []{0,1} aufmerksam | aufmerksam []{0,1} machen))"
   result1 <- RKorAPClient:::removeWithinSpan(query1, "base/s=s")
   expect_equal(result1, "(machen []{0,1} aufmerksam | aufmerksam []{0,1} machen)")

   # Test with different span
   query2 <- "contains(<p/s=s>, (test query))"
   result2 <- RKorAPClient:::removeWithinSpan(query2, "p/s=s")
   expect_equal(result2, "(test query)")

   # Test with empty span - should return original query
   query3 <- "simple query"
   result3 <- RKorAPClient:::removeWithinSpan(query3, "")
   expect_equal(result3, query3)

   # Test with non-matching span
   query4 <- "contains(<base/s=s>, test)"
   result4 <- RKorAPClient:::removeWithinSpan(query4, "other/span")
   expect_equal(result4, query4)
 })

 test_that("matches2FreqTable handles empty matches", {
   empty_matches <- data.frame()
   result <- RKorAPClient:::matches2FreqTable(empty_matches, index = 0)

   expect_true(is.data.frame(result))
   expect_equal(nrow(result), 0)
 })

 test_that("matches2FreqTable processes single match correctly", {
   # Create mock matches data
   mock_matches <- data.frame(
     tokens = I(list(list(
       left = c("der", "große"),
       match = "Test",
       right = c("ist", "wichtig")
     ))),
     stringsAsFactors = FALSE
   )

   result <- RKorAPClient:::matches2FreqTable(
     mock_matches,
     index = 1,
     leftContextSize = 2,
     rightContextSize = 2,
     stopwords = c("der", "ist") # Provide stopwords to avoid empty join
   )

   expect_true(is.data.frame(result))
 })

 test_that("snippet2FreqTable handles empty snippet", {
   result <- RKorAPClient:::snippet2FreqTable(character(0))

   expect_true(is.data.frame(result))
   expect_equal(nrow(result), 0)
 })

 test_that("snippet2FreqTable processes single snippet correctly", {
   snippet <- '<span class="context-left">der große </span><span class="match"><mark>Test</mark></span><span class="context-right"> ist wichtig</span>'

   result <- RKorAPClient:::snippet2FreqTable(
     snippet,
     leftContextSize = 2,
     rightContextSize = 2,
     stopwords = c("der"), # Provide stopwords to avoid empty join
     verbose = FALSE
   )

   expect_true(is.data.frame(result))
 })

 test_that("inject_focus_into_query adds focus wrappers when span queries lack them", {
   unfocused <- "contains(<base/s=s>, (Anspruch [tt/l=nehmen] | [tt/l=nehmen] Anspruch))"
   focused <- RKorAPClient:::inject_focus_into_query(unfocused)

   expect_equal(
     focused,
     "contains(<base/s=s>, (focus({Anspruch [tt/l=nehmen]}) | focus({[tt/l=nehmen] Anspruch})))"
   )
 })

 test_that("inject_focus_into_query leaves existing focus segments unchanged", {
   already_focused <- "contains(<base/s=s>, (focus({Anspruch [tt/l=nehmen]})))"
   expect_identical(
     RKorAPClient:::inject_focus_into_query(already_focused),
     already_focused
   )
 })

 # Removed hanging findExample tests as they cause infinite wait
 # These tests make API calls that don't complete properly

 # Removed hanging collocatesQuery tests as they cause infinite wait
 # These tests were causing the test suite to hang and not terminate

 test_that("collocationAnalysis handles exactFrequencies parameter", {
   skip_if_offline()
   kco <- KorAPConnection(accessToken = NULL, cache = TRUE, verbose = FALSE)

   expect_warning(
     result <- collocationAnalysis(
       kco,
       "Test",
       exactFrequencies = TRUE,
       searchHitsSampleLimit = 5,
       topCollocatesLimit = 5
     ),
     "access token"
   )
   expect_true(is.data.frame(result))
 })

 test_that("collocationAnalysis handles withinSpan parameter", {
   skip_if_offline()
   kco <- KorAPConnection(accessToken = NULL, cache = TRUE, verbose = FALSE)

   expect_warning(
     result <- collocationAnalysis(
       kco,
       "Test",
       withinSpan = "base/s=s",
       exactFrequencies = TRUE,
       searchHitsSampleLimit = 5,
       topCollocatesLimit = 5
     ),
     "access token"
   )
   expect_true(is.data.frame(result))
 })

 test_that("collocationAnalysis handles expand parameter", {
   skip_if_offline()
   kco <- KorAPConnection(accessToken = NULL, cache = TRUE, verbose = FALSE)

   expect_warning(
     result <- collocationAnalysis(
       kco,
       c("Test", "der"),
       expand = TRUE,
       searchHitsSampleLimit = 2,
       topCollocatesLimit = 2
     ),
     "access token"
   )
   expect_true(is.data.frame(result))
 })

 test_that("collocationAnalysis honors named vc labels", {
   skip_if_offline()
   kco <- KorAPConnection(accessToken = NULL, cache = TRUE, verbose = FALSE)

   named_vc <- c(
     Western = "textType=/.*Western.*/ & pubDate in 2012",
     Erotic = "textType=/.*(Erotik|Gay).*/ & pubDate in 2012",
     Historic = "textType=/.*Historisch.*/ & pubDate in 2012"
   )

   expect_warning(
     result <- collocationAnalysis(
       kco,
       "[tt/l=treffen]",
       vc = named_vc,
       searchHitsSampleLimit = 2,
       topCollocatesLimit = 2
     ),
     "access token"
   )

   if (nrow(result) > 0) {
     expect_true("label" %in% colnames(result))
     expect_setequal(unique(result$label), names(named_vc))
   }
 })

 test_that("collocationAnalysis handles stopwords parameter", {
   skip_if_offline()
   kco <- KorAPConnection(accessToken = NULL, cache = TRUE, verbose = FALSE)

   expect_warning(
     result <- collocationAnalysis(
       kco,
       "Test",
       stopwords = c("der", "die", "und"),
       searchHitsSampleLimit = 5,
       topCollocatesLimit = 5
     ),
     "access token"
   )
   expect_true(is.data.frame(result))
 })

 test_that("collocationAnalysis handles lemmatizeNodeQuery parameter", {
   skip_if_offline()
   kco <- KorAPConnection(accessToken = NULL, cache = TRUE, verbose = FALSE)

   expect_warning(
     result <- collocationAnalysis(
       kco,
       "laufen",
       lemmatizeNodeQuery = TRUE,
       searchHitsSampleLimit = 5,
       topCollocatesLimit = 5
     ),
     "access token"
   )
   expect_true(is.data.frame(result))
 })

 test_that("collocationAnalysis handles addExamples parameter", {
   skip_if_offline()
   kco <- KorAPConnection(accessToken = NULL, cache = TRUE, verbose = FALSE)

   expect_warning(
     result <- collocationAnalysis(
       kco,
       "Test",
       addExamples = TRUE,
       searchHitsSampleLimit = 3,
       topCollocatesLimit = 3
     ),
     "access token"
   )
   expect_true(is.data.frame(result))
   if (nrow(result) > 0) {
     expect_true("example" %in% colnames(result))
   }
 })

 test_that("collocationAnalysis handles maxRecurse parameter", {
   skip_if_offline()
   kco <- KorAPConnection(accessToken = NULL, cache = TRUE, verbose = FALSE)

   expect_warning(
     result <- collocationAnalysis(
       kco,
       "Test",
       maxRecurse = 1,
       searchHitsSampleLimit = 2,
       topCollocatesLimit = 2
     ),
     "access token"
   )
   expect_true(is.data.frame(result))
 })
	test_that("collocationScoreQuery works", {
	skip_if_offline()
	kco <- KorAPConnection(accessToken = NULL, cache = TRUE, verbose = TRUE)
	df <- collocationScoreQuery(kco, "Ameisenplage", "heimgesucht", leftContextSize = 0, rightContextSize = 1)
	expect_gt(df$logDice, 1)
	expect_equal(df$ll, ll(df$O1, df$O2, df$O, df$N, df$E, df$w))
	expect_equal(df$pmi, pmi(df$O1, df$O2, df$O, df$N, df$E, df$w))
	expect_equal(df$mi2, mi2(df$O1, df$O2, df$O, df$N, df$E, df$w))
	expect_equal(df$mi3, mi3(df$O1, df$O2, df$O, df$N, df$E, df$w))
	expect_equal(df$logDice, logDice(df$O1, df$O2, df$O, df$N, df$E, df$w))
	})


	test_that("collocationAnalysis works and warns about missing token", {
	skip_if_offline()
	kco <- KorAPConnection(
	accessToken = NULL,
	verbose = TRUE
	)
	expect_warning(
	df <-
	collocationAnalysis(
	kco,
	"focus([tt/p=ADJA] {Newstickeritis})",
	leftContextSize = 1,
	rightContextSize = 0,
	),
	"access token"
	)
	expect_gt(df$O, df$E)
	expect_gt(df$logDice, -1)
	})

	test_that("collocationAnalysis on unaccounted strings does not error out", {
	skip_if_offline()
	kco <- KorAPConnection(accessToken = NULL, verbose = TRUE)
	expect_warning(
	df <- collocationAnalysis(kco, "XXXXXXXXAmeisenplage", vc = c("corpusSigle=/WDD17/", "corpusSigle=/WUD17/"), maxRecurse = 2),
	"access token"
	)
	testthat::expect_equal(nrow(df), 0)
	})

	# test_that("removeWithinSpanWorks", {
	# expect_equal(
	# removeWithinSpan("contains(<base/s=s>, (machen []{0,1} aufmerksam \| aufmerksam []{0,1} machen))", "base/s=s"),
	# "(machen []{0,1} aufmerksam \| aufmerksam []{0,1} machen)")
	# })


	test_that("mergeDuplicateCollocatesWorksAsExpected", {
	ldf <- tibble::tibble(
	node = c("focus(in [tt/p=NN] {[tt/l=nehmen]})"),
	collocate = c("Anspruch"),
	label = c(""),
	vc = c(""),
	query = c("Anspruch focus(in [tt/p=NN] {[tt/l=nehmen]})"),
	webUIRequestUrl = c(
	"https://korap.ids-mannheim.de/?q=Anspruch%20focus%28in%20%5btt%2fp%3dNN%5d%20%7b%5btt%2fl%3dnehmen%5d%7d%29&ql=poliqarp"
	),
	w = c(1),
	leftContextSize = c(1),
	rightContextSize = c(0),
	N = c(23578528381.5),
	O = c(0.5),
	O1 = c(1168410.5),
	O2 = c(1296870.5),
	E = c(64.2651265093014),
	pmi = c(11.9173498777957),
	mi2 = c(29.8406639214616),
	mi3 = c(47.7639779651274),
	logDice = c(11.6899933757298),
	ll = c(3717716.74208791)
	)
	rdf <- tibble::tibble(
	node = c("focus({[tt/l=nehmen] in} [tt/p=NN])"),
	collocate = c("Anspruch"),
	label = c(""),
	vc = c(""),
	query = c("focus({[tt/l=nehmen] in} [tt/p=NN]) Anspruch"),
	webUIRequestUrl = c(
	"https://korap.ids-mannheim.de/?q=focus%28%7b%5btt%2fl%3dnehmen%5d%20in%7d%20%5btt%2fp%3dNN%5d%29%20Anspruch&ql=poliqarp"
	),
	w = c(1),
	leftContextSize = c(0),
	rightContextSize = c(1),
	N = c(23578528381.5),
	O = c(0.5),
	O1 = c(17077.5),
	O2 = c(1296870.5),
	E = c(0.939299756346416),
	pmi = c(7.99469408391783),
	mi2 = c(15.8990457079122),
	mi3 = c(23.8033973319065),
	logDice = c(2.57887487309409),
	ll = c(2181.35986032019)
	)
	merged <- mergeDuplicateCollocates(ldf, rdf, smoothingConstant = 0.5)
	expect_equal(merged$O, 0.5)
	expect_equal(merged$O1, 1185487.5)
	expect_equal(merged$O2, 1296870.5)
	expect_equal(merged$query, "Anspruch focus(in [tt/p=NN] {[tt/l=nehmen]}) \| focus({[tt/l=nehmen] in} [tt/p=NN]) Anspruch")
	})

	test_that("add_multi_vc_comparisons adds favorite columns", {
	sample_result <- tibble::tibble(
	node = c("n", "n"),
	collocate = c("c", "c"),
	vc = c("vc1", "vc2"),
	label = c("A", "B"),
	N = c(100, 100),
	O = c(10, 20),
	O1 = c(50, 50),
	O2 = c(30, 30),
	E = c(5, 5),
	w = c(2, 2),
	leftContextSize = c(1, 1),
	rightContextSize = c(1, 1),
	frequency = c(10, 20),
	logDice = c(5, 7),
	pmi = c(2, 3)
	)

	enriched <- RKorAPClient:::add_multi_vc_comparisons(sample_result)

	expect_true(all(c(
	"winner_logDice",
	"winner_logDice_value",
	"runner_up_logDice",
	"runner_up_logDice_value",
	"max_delta_logDice",
	"winner_rank_logDice",
	"winner_rank_logDice_value",
	"runner_up_rank_logDice",
	"runner_up_rank_logDice_value",
	"loser_rank_logDice",
	"loser_rank_logDice_value",
	"max_delta_rank_logDice",
	"winner_percentile_rank_logDice",
	"winner_percentile_rank_logDice_value",
	"runner_up_percentile_rank_logDice",
	"runner_up_percentile_rank_logDice_value",
	"loser_percentile_rank_logDice",
	"loser_percentile_rank_logDice_value",
	"max_delta_percentile_rank_logDice",
	"winner_rank_pmi",
	"winner_rank_pmi_value",
	"runner_up_rank_pmi",
	"runner_up_rank_pmi_value",
	"loser_rank_pmi",
	"loser_rank_pmi_value",
	"max_delta_rank_pmi",
	"winner_percentile_rank_pmi",
	"winner_percentile_rank_pmi_value",
	"runner_up_percentile_rank_pmi",
	"runner_up_percentile_rank_pmi_value",
	"loser_percentile_rank_pmi",
	"loser_percentile_rank_pmi_value",
	"max_delta_percentile_rank_pmi",
	"rank_A_logDice",
	"rank_B_logDice",
	"rank_A_pmi",
	"rank_B_pmi",
	"percentile_rank_A_logDice",
	"percentile_rank_B_logDice",
	"percentile_rank_A_pmi",
	"percentile_rank_B_pmi",
	"delta_rank_logDice",
	"delta_rank_pmi",
	"delta_percentile_rank_logDice",
	"delta_percentile_rank_pmi"
	) %in% colnames(enriched)))

	expect_true(all(enriched$winner_logDice == "B"))
	expect_true(all(enriched$runner_up_logDice == "A"))
	expect_true(all(enriched$winner_logDice_value >= enriched$runner_up_logDice_value))
	expect_true(all(enriched$percentile_rank_A_logDice == 1))
	expect_true(all(enriched$percentile_rank_B_logDice == 1))
	expect_true(all(enriched$delta_percentile_rank_logDice == 0))
	})

	test_that("add_multi_vc_comparisons handles more than two labels", {
	sample_result <- tibble::tibble(
	node = rep("n", 3),
	collocate = rep("c", 3),
	vc = c("vc1", "vc2", "vc3"),
	label = c("A", "B", "C"),
	N = rep(100, 3),
	O = c(10, 30, 5),
	O1 = rep(50, 3),
	O2 = rep(30, 3),
	E = rep(5, 3),
	w = rep(2, 3),
	leftContextSize = rep(1, 3),
	rightContextSize = rep(1, 3),
	frequency = c(10, 30, 5),
	logDice = c(5, 8, 4),
	pmi = c(2, 3, 1)
	)

	enriched <- RKorAPClient:::add_multi_vc_comparisons(sample_result)
	expect_equal(enriched$winner_logDice[1], "B")
	expect_equal(enriched$winner_logDice_value[1], 8)
	expect_equal(enriched$runner_up_logDice[1], "A")
	expect_equal(enriched$runner_up_logDice_value[1], 5)
	expect_equal(enriched$loser_logDice[1], "C")
	expect_equal(enriched$loser_logDice_value[1], 4)
	expect_equal(enriched$max_delta_logDice[1], 4)
	})

	test_that("add_multi_vc_comparisons computes rank deltas", {
	base_tbl <- tidyr::expand_grid(
	label = c("A", "B", "C"),
	collocate = c("c1", "c2", "c3")
	) \|>
	dplyr::mutate(
	node = "n",
	vc = paste0("vc", label),
	N = 100,
	O = 10,
	O1 = 50,
	O2 = 40,
	E = 5,
	w = 2,
	leftContextSize = 1,
	rightContextSize = 1,
	frequency = 10,
	logDice = dplyr::case_when(
	label == "A" & collocate == "c1" ~ 9,
	label == "A" & collocate == "c2" ~ 6,
	label == "A" & collocate == "c3" ~ 3,
	label == "B" & collocate == "c1" ~ 7,
	label == "B" & collocate == "c2" ~ 9,
	label == "B" & collocate == "c3" ~ 5,
	label == "C" & collocate == "c1" ~ 4,
	label == "C" & collocate == "c2" ~ 6,
	label == "C" & collocate == "c3" ~ 8,
	TRUE ~ 0
	)
	)

	enriched <- RKorAPClient:::add_multi_vc_comparisons(base_tbl)
	target_row <- enriched \|>
	dplyr::filter(collocate == "c1") \|>
	dplyr::slice_head(n = 1)

	expect_equal(target_row$rank_A_logDice, 1)
	expect_equal(target_row$rank_B_logDice, 2)
	expect_equal(target_row$rank_C_logDice, 3)
	expect_equal(target_row$winner_rank_logDice, "A")
	expect_equal(target_row$winner_rank_logDice_value, 1)
	expect_equal(target_row$runner_up_rank_logDice, "B")
	expect_equal(target_row$runner_up_rank_logDice_value, 2)
	expect_equal(target_row$loser_rank_logDice, "C")
	expect_equal(target_row$loser_rank_logDice_value, 3)
	expect_equal(target_row$max_delta_rank_logDice, 2)
	})

	test_that("add_multi_vc_comparisons imputes missing ranks for max delta", {
	sample_result <- tibble::tibble(
	node = c("n", "n"),
	collocate = c("c", "c"),
	vc = c("vc1", "vc2"),
	label = c("A", "B"),
	N = c(100, 100),
	O = c(10, 10),
	O1 = c(50, 50),
	O2 = c(30, 30),
	E = c(5, 5),
	w = c(2, 2),
	leftContextSize = c(1, 1),
	rightContextSize = c(1, 1),
	frequency = c(10, 10),
	logDice = c(5, NA)
	)

	enriched <- RKorAPClient:::add_multi_vc_comparisons(sample_result)

	expect_equal(enriched$rank_A_logDice[1], 1)
	expect_true(is.na(enriched$rank_B_logDice[1]))
	expect_equal(enriched$winner_rank_logDice[1], "A")
	expect_equal(enriched$loser_rank_logDice[1], "B")
	expect_equal(enriched$loser_rank_logDice_value[1], 2)
	expect_equal(enriched$max_delta_rank_logDice[1], 1)
	})

	test_that("adaptive missing score imputation respects measure-specific scales", {
	sample_result <- tibble::tibble(
	node = c("n", "n", "n"),
	collocate = c("c", "c", "c"),
	vc = c("vc1", "vc2", "vc3"),
	label = c("A", "B", "C"),
	N = c(100, 100, 100),
	O = c(12, 9, 7),
	O1 = c(60, 40, 30),
	O2 = c(33, 22, 18),
	E = c(6, 6, 6),
	w = c(2, 2, 2),
	leftContextSize = c(1, 1, 1),
	rightContextSize = c(1, 1, 1),
	frequency = c(15, 11, 9),
	logDice = c(-0.31, NA, -0.12),
	pmi = c(-1.65, NA, -0.48),
	ll = c(12.4, NA, 7.9)
	)

	enriched <- RKorAPClient:::add_multi_vc_comparisons(
	sample_result,
	missingScoreQuantile = 0.05
	)

	row_a <- dplyr::filter(enriched, label == "A") \|> dplyr::slice_head(n = 1)

	expect_false(is.na(row_a$logDice_B))
	expect_false(is.na(row_a$pmi_B))
	expect_false(is.na(row_a$ll_B))

	expect_lt(row_a$logDice_B, min(sample_result$logDice, na.rm = TRUE))
	expect_lt(row_a$pmi_B, min(sample_result$pmi, na.rm = TRUE))
	expect_lte(row_a$ll_B, min(sample_result$ll, na.rm = TRUE))

	expect_gt(row_a$max_delta_logDice, 0)
	expect_gt(row_a$winner_logDice_value - row_a$loser_logDice_value, 0)
	})

	# New tests for improved coverage of collocationAnalysis.R helper functions

	test_that("synsemanticStopwords returns German stopwords", {
	stopwords <- synsemanticStopwords()
	expect_true(is.character(stopwords))
	expect_true(length(stopwords) > 50)
	expect_true("der" %in% stopwords)
	expect_true("die" %in% stopwords)
	expect_true("und" %in% stopwords)
	expect_true("mit" %in% stopwords)
	})

	test_that("removeWithinSpan removes span constraints correctly", {
	# Test basic span removal
	query1 <- "contains(<base/s=s>, (machen []{0,1} aufmerksam \| aufmerksam []{0,1} machen))"
	result1 <- RKorAPClient:::removeWithinSpan(query1, "base/s=s")
	expect_equal(result1, "(machen []{0,1} aufmerksam \| aufmerksam []{0,1} machen)")

	# Test with different span
	query2 <- "contains(<p/s=s>, (test query))"
	result2 <- RKorAPClient:::removeWithinSpan(query2, "p/s=s")
	expect_equal(result2, "(test query)")

	# Test with empty span - should return original query
	query3 <- "simple query"
	result3 <- RKorAPClient:::removeWithinSpan(query3, "")
	expect_equal(result3, query3)

	# Test with non-matching span
	query4 <- "contains(<base/s=s>, test)"
	result4 <- RKorAPClient:::removeWithinSpan(query4, "other/span")
	expect_equal(result4, query4)
	})

	test_that("matches2FreqTable handles empty matches", {
	empty_matches <- data.frame()
	result <- RKorAPClient:::matches2FreqTable(empty_matches, index = 0)

	expect_true(is.data.frame(result))
	expect_equal(nrow(result), 0)
	})

	test_that("matches2FreqTable processes single match correctly", {
	# Create mock matches data
	mock_matches <- data.frame(
	tokens = I(list(list(
	left = c("der", "große"),
	match = "Test",
	right = c("ist", "wichtig")
	))),
	stringsAsFactors = FALSE
	)

	result <- RKorAPClient:::matches2FreqTable(
	mock_matches,
	index = 1,
	leftContextSize = 2,
	rightContextSize = 2,
	stopwords = c("der", "ist") # Provide stopwords to avoid empty join
	)

	expect_true(is.data.frame(result))
	})

	test_that("snippet2FreqTable handles empty snippet", {
	result <- RKorAPClient:::snippet2FreqTable(character(0))

	expect_true(is.data.frame(result))
	expect_equal(nrow(result), 0)
	})

	test_that("snippet2FreqTable processes single snippet correctly", {
	snippet <- '<span class="context-left">der große </span><span class="match"><mark>Test</mark></span><span class="context-right"> ist wichtig</span>'

	result <- RKorAPClient:::snippet2FreqTable(
	snippet,
	leftContextSize = 2,
	rightContextSize = 2,
	stopwords = c("der"), # Provide stopwords to avoid empty join
	verbose = FALSE
	)

	expect_true(is.data.frame(result))
	})

	test_that("inject_focus_into_query adds focus wrappers when span queries lack them", {
	unfocused <- "contains(<base/s=s>, (Anspruch [tt/l=nehmen] \| [tt/l=nehmen] Anspruch))"
	focused <- RKorAPClient:::inject_focus_into_query(unfocused)

	expect_equal(
	focused,
	"contains(<base/s=s>, (focus({Anspruch [tt/l=nehmen]}) \| focus({[tt/l=nehmen] Anspruch})))"
	)
	})

	test_that("inject_focus_into_query leaves existing focus segments unchanged", {
	already_focused <- "contains(<base/s=s>, (focus({Anspruch [tt/l=nehmen]})))"
	expect_identical(
	RKorAPClient:::inject_focus_into_query(already_focused),
	already_focused
	)
	})

	# Removed hanging findExample tests as they cause infinite wait
	# These tests make API calls that don't complete properly

	# Removed hanging collocatesQuery tests as they cause infinite wait
	# These tests were causing the test suite to hang and not terminate

	test_that("collocationAnalysis handles exactFrequencies parameter", {
	skip_if_offline()
	kco <- KorAPConnection(accessToken = NULL, cache = TRUE, verbose = FALSE)

	expect_warning(
	result <- collocationAnalysis(
	kco,
	"Test",
	exactFrequencies = TRUE,
	searchHitsSampleLimit = 5,
	topCollocatesLimit = 5
	),
	"access token"
	)
	expect_true(is.data.frame(result))
	})

	test_that("collocationAnalysis handles withinSpan parameter", {
	skip_if_offline()
	kco <- KorAPConnection(accessToken = NULL, cache = TRUE, verbose = FALSE)

	expect_warning(
	result <- collocationAnalysis(
	kco,
	"Test",
	withinSpan = "base/s=s",
	exactFrequencies = TRUE,
	searchHitsSampleLimit = 5,
	topCollocatesLimit = 5
	),
	"access token"
	)
	expect_true(is.data.frame(result))
	})

	test_that("collocationAnalysis handles expand parameter", {
	skip_if_offline()
	kco <- KorAPConnection(accessToken = NULL, cache = TRUE, verbose = FALSE)

	expect_warning(
	result <- collocationAnalysis(
	kco,
	c("Test", "der"),
	expand = TRUE,
	searchHitsSampleLimit = 2,
	topCollocatesLimit = 2
	),
	"access token"
	)
	expect_true(is.data.frame(result))
	})

	test_that("collocationAnalysis honors named vc labels", {
	skip_if_offline()
	kco <- KorAPConnection(accessToken = NULL, cache = TRUE, verbose = FALSE)

	named_vc <- c(
	Western = "textType=/.Western./ & pubDate in 2012",
	Erotic = "textType=/.(Erotik\|Gay)./ & pubDate in 2012",
	Historic = "textType=/.Historisch./ & pubDate in 2012"
	)

	expect_warning(
	result <- collocationAnalysis(
	kco,
	"[tt/l=treffen]",
	vc = named_vc,
	searchHitsSampleLimit = 2,
	topCollocatesLimit = 2
	),
	"access token"
	)

	if (nrow(result) > 0) {
	expect_true("label" %in% colnames(result))
	expect_setequal(unique(result$label), names(named_vc))
	}
	})

	test_that("collocationAnalysis handles stopwords parameter", {
	skip_if_offline()
	kco <- KorAPConnection(accessToken = NULL, cache = TRUE, verbose = FALSE)

	expect_warning(
	result <- collocationAnalysis(
	kco,
	"Test",
	stopwords = c("der", "die", "und"),
	searchHitsSampleLimit = 5,
	topCollocatesLimit = 5
	),
	"access token"
	)
	expect_true(is.data.frame(result))
	})

	test_that("collocationAnalysis handles lemmatizeNodeQuery parameter", {
	skip_if_offline()
	kco <- KorAPConnection(accessToken = NULL, cache = TRUE, verbose = FALSE)

	expect_warning(
	result <- collocationAnalysis(
	kco,
	"laufen",
	lemmatizeNodeQuery = TRUE,
	searchHitsSampleLimit = 5,
	topCollocatesLimit = 5
	),
	"access token"
	)
	expect_true(is.data.frame(result))
	})

	test_that("collocationAnalysis handles addExamples parameter", {
	skip_if_offline()
	kco <- KorAPConnection(accessToken = NULL, cache = TRUE, verbose = FALSE)

	expect_warning(
	result <- collocationAnalysis(
	kco,
	"Test",
	addExamples = TRUE,
	searchHitsSampleLimit = 3,
	topCollocatesLimit = 3
	),
	"access token"
	)
	expect_true(is.data.frame(result))
	if (nrow(result) > 0) {
	expect_true("example" %in% colnames(result))
	}
	})

	test_that("collocationAnalysis handles maxRecurse parameter", {
	skip_if_offline()
	kco <- KorAPConnection(accessToken = NULL, cache = TRUE, verbose = FALSE)

	expect_warning(
	result <- collocationAnalysis(
	kco,
	"Test",
	maxRecurse = 1,
	searchHitsSampleLimit = 2,
	topCollocatesLimit = 2
	),
	"access token"
	)
	expect_true(is.data.frame(result))
	})