blob: 14508f68263067e46c8087e66e43fa567de8d097 [file] [log] [blame]
test_that("collocationScoreQuery works", {
skip_if_offline()
kco <- KorAPConnection(accessToken = NULL, cache = TRUE, verbose = TRUE)
df <- collocationScoreQuery(kco, "Ameisenplage", "heimgesucht", leftContextSize = 0, rightContextSize = 1)
expect_gt(df$logDice, 1)
expect_equal(df$ll, ll(df$O1, df$O2, df$O, df$N, df$E, df$w))
expect_equal(df$pmi, pmi(df$O1, df$O2, df$O, df$N, df$E, df$w))
expect_equal(df$mi2, mi2(df$O1, df$O2, df$O, df$N, df$E, df$w))
expect_equal(df$mi3, mi3(df$O1, df$O2, df$O, df$N, df$E, df$w))
expect_equal(df$logDice, logDice(df$O1, df$O2, df$O, df$N, df$E, df$w))
})
test_that("collocationAnalysis works and warns about missing token", {
skip_if_offline()
kco <- KorAPConnection(
accessToken = NULL,
verbose = TRUE
)
expect_warning(
df <-
collocationAnalysis(
kco,
"focus([tt/p=ADJA] {Newstickeritis})",
leftContextSize = 1,
rightContextSize = 0,
),
"access token"
)
expect_gt(df$O, df$E)
expect_gt(df$logDice, -1)
})
test_that("collocationAnalysis on unaccounted strings does not error out", {
skip_if_offline()
kco <- KorAPConnection(accessToken = NULL, verbose = TRUE)
expect_warning(
df <- collocationAnalysis(kco, "XXXXXXXXAmeisenplage", vc = c("corpusSigle=/WDD17/", "corpusSigle=/WUD17/"), maxRecurse = 2),
"access token"
)
testthat::expect_equal(nrow(df), 0)
})
# test_that("removeWithinSpanWorks", {
# expect_equal(
# removeWithinSpan("contains(<base/s=s>, (machen []{0,1} aufmerksam | aufmerksam []{0,1} machen))", "base/s=s"),
# "(machen []{0,1} aufmerksam | aufmerksam []{0,1} machen)")
# })
test_that("mergeDuplicateCollocatesWorksAsExpected", {
ldf <- tibble::tibble(
node = c("focus(in [tt/p=NN] {[tt/l=nehmen]})"),
collocate = c("Anspruch"),
label = c(""),
vc = c(""),
query = c("Anspruch focus(in [tt/p=NN] {[tt/l=nehmen]})"),
webUIRequestUrl = c(
"https://korap.ids-mannheim.de/?q=Anspruch%20focus%28in%20%5btt%2fp%3dNN%5d%20%7b%5btt%2fl%3dnehmen%5d%7d%29&ql=poliqarp"
),
w = c(1),
leftContextSize = c(1),
rightContextSize = c(0),
N = c(23578528381.5),
O = c(0.5),
O1 = c(1168410.5),
O2 = c(1296870.5),
E = c(64.2651265093014),
pmi = c(11.9173498777957),
mi2 = c(29.8406639214616),
mi3 = c(47.7639779651274),
logDice = c(11.6899933757298),
ll = c(3717716.74208791)
)
rdf <- tibble::tibble(
node = c("focus({[tt/l=nehmen] in} [tt/p=NN])"),
collocate = c("Anspruch"),
label = c(""),
vc = c(""),
query = c("focus({[tt/l=nehmen] in} [tt/p=NN]) Anspruch"),
webUIRequestUrl = c(
"https://korap.ids-mannheim.de/?q=focus%28%7b%5btt%2fl%3dnehmen%5d%20in%7d%20%5btt%2fp%3dNN%5d%29%20Anspruch&ql=poliqarp"
),
w = c(1),
leftContextSize = c(0),
rightContextSize = c(1),
N = c(23578528381.5),
O = c(0.5),
O1 = c(17077.5),
O2 = c(1296870.5),
E = c(0.939299756346416),
pmi = c(7.99469408391783),
mi2 = c(15.8990457079122),
mi3 = c(23.8033973319065),
logDice = c(2.57887487309409),
ll = c(2181.35986032019)
)
merged <- mergeDuplicateCollocates(ldf, rdf, smoothingConstant = 0.5)
expect_equal(merged$O, 0.5)
expect_equal(merged$O1, 1185487.5)
expect_equal(merged$O2, 1296870.5)
expect_equal(merged$query, "Anspruch focus(in [tt/p=NN] {[tt/l=nehmen]}) | focus({[tt/l=nehmen] in} [tt/p=NN]) Anspruch")
})
test_that("add_multi_vc_comparisons adds favorite columns", {
sample_result <- tibble::tibble(
node = c("n", "n"),
collocate = c("c", "c"),
vc = c("vc1", "vc2"),
label = c("A", "B"),
N = c(100, 100),
O = c(10, 20),
O1 = c(50, 50),
O2 = c(30, 30),
E = c(5, 5),
w = c(2, 2),
leftContextSize = c(1, 1),
rightContextSize = c(1, 1),
frequency = c(10, 20),
logDice = c(5, 7),
pmi = c(2, 3)
)
enriched <- RKorAPClient:::add_multi_vc_comparisons(sample_result)
expect_true(all(c(
"winner_logDice",
"winner_logDice_value",
"runner_up_logDice",
"runner_up_logDice_value",
"max_delta_logDice",
"winner_rank_logDice",
"winner_rank_logDice_value",
"runner_up_rank_logDice",
"runner_up_rank_logDice_value",
"loser_rank_logDice",
"loser_rank_logDice_value",
"max_delta_rank_logDice",
"winner_percentile_rank_logDice",
"winner_percentile_rank_logDice_value",
"runner_up_percentile_rank_logDice",
"runner_up_percentile_rank_logDice_value",
"loser_percentile_rank_logDice",
"loser_percentile_rank_logDice_value",
"max_delta_percentile_rank_logDice",
"winner_rank_pmi",
"winner_rank_pmi_value",
"runner_up_rank_pmi",
"runner_up_rank_pmi_value",
"loser_rank_pmi",
"loser_rank_pmi_value",
"max_delta_rank_pmi",
"winner_percentile_rank_pmi",
"winner_percentile_rank_pmi_value",
"runner_up_percentile_rank_pmi",
"runner_up_percentile_rank_pmi_value",
"loser_percentile_rank_pmi",
"loser_percentile_rank_pmi_value",
"max_delta_percentile_rank_pmi",
"rank_A_logDice",
"rank_B_logDice",
"rank_A_pmi",
"rank_B_pmi",
"percentile_rank_A_logDice",
"percentile_rank_B_logDice",
"percentile_rank_A_pmi",
"percentile_rank_B_pmi",
"delta_rank_logDice",
"delta_rank_pmi",
"delta_percentile_rank_logDice",
"delta_percentile_rank_pmi"
) %in% colnames(enriched)))
expect_true(all(enriched$winner_logDice == "B"))
expect_true(all(enriched$runner_up_logDice == "A"))
expect_true(all(enriched$winner_logDice_value >= enriched$runner_up_logDice_value))
expect_true(all(enriched$percentile_rank_A_logDice == 1))
expect_true(all(enriched$percentile_rank_B_logDice == 1))
expect_true(all(enriched$delta_percentile_rank_logDice == 0))
})
test_that("add_multi_vc_comparisons handles more than two labels", {
sample_result <- tibble::tibble(
node = rep("n", 3),
collocate = rep("c", 3),
vc = c("vc1", "vc2", "vc3"),
label = c("A", "B", "C"),
N = rep(100, 3),
O = c(10, 30, 5),
O1 = rep(50, 3),
O2 = rep(30, 3),
E = rep(5, 3),
w = rep(2, 3),
leftContextSize = rep(1, 3),
rightContextSize = rep(1, 3),
frequency = c(10, 30, 5),
logDice = c(5, 8, 4),
pmi = c(2, 3, 1)
)
enriched <- RKorAPClient:::add_multi_vc_comparisons(sample_result)
expect_equal(enriched$winner_logDice[1], "B")
expect_equal(enriched$winner_logDice_value[1], 8)
expect_equal(enriched$runner_up_logDice[1], "A")
expect_equal(enriched$runner_up_logDice_value[1], 5)
expect_equal(enriched$loser_logDice[1], "C")
expect_equal(enriched$loser_logDice_value[1], 4)
expect_equal(enriched$max_delta_logDice[1], 4)
})
test_that("add_multi_vc_comparisons computes rank deltas", {
base_tbl <- tidyr::expand_grid(
label = c("A", "B", "C"),
collocate = c("c1", "c2", "c3")
) |>
dplyr::mutate(
node = "n",
vc = paste0("vc", label),
N = 100,
O = 10,
O1 = 50,
O2 = 40,
E = 5,
w = 2,
leftContextSize = 1,
rightContextSize = 1,
frequency = 10,
logDice = dplyr::case_when(
label == "A" & collocate == "c1" ~ 9,
label == "A" & collocate == "c2" ~ 6,
label == "A" & collocate == "c3" ~ 3,
label == "B" & collocate == "c1" ~ 7,
label == "B" & collocate == "c2" ~ 9,
label == "B" & collocate == "c3" ~ 5,
label == "C" & collocate == "c1" ~ 4,
label == "C" & collocate == "c2" ~ 6,
label == "C" & collocate == "c3" ~ 8,
TRUE ~ 0
)
)
enriched <- RKorAPClient:::add_multi_vc_comparisons(base_tbl)
target_row <- enriched |>
dplyr::filter(collocate == "c1") |>
dplyr::slice_head(n = 1)
expect_equal(target_row$rank_A_logDice, 1)
expect_equal(target_row$rank_B_logDice, 2)
expect_equal(target_row$rank_C_logDice, 3)
expect_equal(target_row$winner_rank_logDice, "A")
expect_equal(target_row$winner_rank_logDice_value, 1)
expect_equal(target_row$runner_up_rank_logDice, "B")
expect_equal(target_row$runner_up_rank_logDice_value, 2)
expect_equal(target_row$loser_rank_logDice, "C")
expect_equal(target_row$loser_rank_logDice_value, 3)
expect_equal(target_row$max_delta_rank_logDice, 2)
})
test_that("add_multi_vc_comparisons imputes missing ranks for max delta", {
sample_result <- tibble::tibble(
node = c("n", "n"),
collocate = c("c", "c"),
vc = c("vc1", "vc2"),
label = c("A", "B"),
N = c(100, 100),
O = c(10, 10),
O1 = c(50, 50),
O2 = c(30, 30),
E = c(5, 5),
w = c(2, 2),
leftContextSize = c(1, 1),
rightContextSize = c(1, 1),
frequency = c(10, 10),
logDice = c(5, NA)
)
enriched <- RKorAPClient:::add_multi_vc_comparisons(sample_result)
expect_equal(enriched$rank_A_logDice[1], 1)
expect_true(is.na(enriched$rank_B_logDice[1]))
expect_equal(enriched$winner_rank_logDice[1], "A")
expect_equal(enriched$loser_rank_logDice[1], "B")
expect_equal(enriched$loser_rank_logDice_value[1], 2)
expect_equal(enriched$max_delta_rank_logDice[1], 1)
})
test_that("adaptive missing score imputation respects measure-specific scales", {
sample_result <- tibble::tibble(
node = c("n", "n", "n"),
collocate = c("c", "c", "c"),
vc = c("vc1", "vc2", "vc3"),
label = c("A", "B", "C"),
N = c(100, 100, 100),
O = c(12, 9, 7),
O1 = c(60, 40, 30),
O2 = c(33, 22, 18),
E = c(6, 6, 6),
w = c(2, 2, 2),
leftContextSize = c(1, 1, 1),
rightContextSize = c(1, 1, 1),
frequency = c(15, 11, 9),
logDice = c(-0.31, NA, -0.12),
pmi = c(-1.65, NA, -0.48),
ll = c(12.4, NA, 7.9)
)
enriched <- RKorAPClient:::add_multi_vc_comparisons(
sample_result,
missingScoreQuantile = 0.05
)
row_a <- dplyr::filter(enriched, label == "A") |> dplyr::slice_head(n = 1)
expect_false(is.na(row_a$logDice_B))
expect_false(is.na(row_a$pmi_B))
expect_false(is.na(row_a$ll_B))
expect_lt(row_a$logDice_B, min(sample_result$logDice, na.rm = TRUE))
expect_lt(row_a$pmi_B, min(sample_result$pmi, na.rm = TRUE))
expect_lte(row_a$ll_B, min(sample_result$ll, na.rm = TRUE))
expect_gt(row_a$max_delta_logDice, 0)
expect_gt(row_a$winner_logDice_value - row_a$loser_logDice_value, 0)
})
# New tests for improved coverage of collocationAnalysis.R helper functions
test_that("synsemanticStopwords returns German stopwords", {
stopwords <- synsemanticStopwords()
expect_true(is.character(stopwords))
expect_true(length(stopwords) > 50)
expect_true("der" %in% stopwords)
expect_true("die" %in% stopwords)
expect_true("und" %in% stopwords)
expect_true("mit" %in% stopwords)
})
test_that("removeWithinSpan removes span constraints correctly", {
# Test basic span removal
query1 <- "contains(<base/s=s>, (machen []{0,1} aufmerksam | aufmerksam []{0,1} machen))"
result1 <- RKorAPClient:::removeWithinSpan(query1, "base/s=s")
expect_equal(result1, "(machen []{0,1} aufmerksam | aufmerksam []{0,1} machen)")
# Test with different span
query2 <- "contains(<p/s=s>, (test query))"
result2 <- RKorAPClient:::removeWithinSpan(query2, "p/s=s")
expect_equal(result2, "(test query)")
# Test with empty span - should return original query
query3 <- "simple query"
result3 <- RKorAPClient:::removeWithinSpan(query3, "")
expect_equal(result3, query3)
# Test with non-matching span
query4 <- "contains(<base/s=s>, test)"
result4 <- RKorAPClient:::removeWithinSpan(query4, "other/span")
expect_equal(result4, query4)
})
test_that("matches2FreqTable handles empty matches", {
empty_matches <- data.frame()
result <- RKorAPClient:::matches2FreqTable(empty_matches, index = 0)
expect_true(is.data.frame(result))
expect_equal(nrow(result), 0)
})
test_that("matches2FreqTable processes single match correctly", {
# Create mock matches data
mock_matches <- data.frame(
tokens = I(list(list(
left = c("der", "große"),
match = "Test",
right = c("ist", "wichtig")
))),
stringsAsFactors = FALSE
)
result <- RKorAPClient:::matches2FreqTable(
mock_matches,
index = 1,
leftContextSize = 2,
rightContextSize = 2,
stopwords = c("der", "ist") # Provide stopwords to avoid empty join
)
expect_true(is.data.frame(result))
})
test_that("snippet2FreqTable handles empty snippet", {
result <- RKorAPClient:::snippet2FreqTable(character(0))
expect_true(is.data.frame(result))
expect_equal(nrow(result), 0)
})
test_that("snippet2FreqTable processes single snippet correctly", {
snippet <- '<span class="context-left">der große </span><span class="match"><mark>Test</mark></span><span class="context-right"> ist wichtig</span>'
result <- RKorAPClient:::snippet2FreqTable(
snippet,
leftContextSize = 2,
rightContextSize = 2,
stopwords = c("der"), # Provide stopwords to avoid empty join
verbose = FALSE
)
expect_true(is.data.frame(result))
})
test_that("inject_focus_into_query adds focus wrappers when span queries lack them", {
unfocused <- "contains(<base/s=s>, (Anspruch [tt/l=nehmen] | [tt/l=nehmen] Anspruch))"
focused <- RKorAPClient:::inject_focus_into_query(unfocused)
expect_equal(
focused,
"contains(<base/s=s>, (focus({Anspruch [tt/l=nehmen]}) | focus({[tt/l=nehmen] Anspruch})))"
)
})
test_that("inject_focus_into_query leaves existing focus segments unchanged", {
already_focused <- "contains(<base/s=s>, (focus({Anspruch [tt/l=nehmen]})))"
expect_identical(
RKorAPClient:::inject_focus_into_query(already_focused),
already_focused
)
})
# Removed hanging findExample tests as they cause infinite wait
# These tests make API calls that don't complete properly
# Removed hanging collocatesQuery tests as they cause infinite wait
# These tests were causing the test suite to hang and not terminate
test_that("collocationAnalysis handles exactFrequencies parameter", {
skip_if_offline()
kco <- KorAPConnection(accessToken = NULL, cache = TRUE, verbose = FALSE)
expect_warning(
result <- collocationAnalysis(
kco,
"Test",
exactFrequencies = TRUE,
searchHitsSampleLimit = 5,
topCollocatesLimit = 5
),
"access token"
)
expect_true(is.data.frame(result))
})
test_that("collocationAnalysis handles withinSpan parameter", {
skip_if_offline()
kco <- KorAPConnection(accessToken = NULL, cache = TRUE, verbose = FALSE)
expect_warning(
result <- collocationAnalysis(
kco,
"Test",
withinSpan = "base/s=s",
exactFrequencies = TRUE,
searchHitsSampleLimit = 5,
topCollocatesLimit = 5
),
"access token"
)
expect_true(is.data.frame(result))
})
test_that("collocationAnalysis handles expand parameter", {
skip_if_offline()
kco <- KorAPConnection(accessToken = NULL, cache = TRUE, verbose = FALSE)
expect_warning(
result <- collocationAnalysis(
kco,
c("Test", "der"),
expand = TRUE,
searchHitsSampleLimit = 2,
topCollocatesLimit = 2
),
"access token"
)
expect_true(is.data.frame(result))
})
test_that("collocationAnalysis honors named vc labels", {
skip_if_offline()
kco <- KorAPConnection(accessToken = NULL, cache = TRUE, verbose = FALSE)
named_vc <- c(
Western = "textType=/.*Western.*/ & pubDate in 2012",
Erotic = "textType=/.*(Erotik|Gay).*/ & pubDate in 2012",
Historic = "textType=/.*Historisch.*/ & pubDate in 2012"
)
expect_warning(
result <- collocationAnalysis(
kco,
"[tt/l=treffen]",
vc = named_vc,
searchHitsSampleLimit = 2,
topCollocatesLimit = 2
),
"access token"
)
if (nrow(result) > 0) {
expect_true("label" %in% colnames(result))
expect_setequal(unique(result$label), names(named_vc))
}
})
test_that("collocationAnalysis handles stopwords parameter", {
skip_if_offline()
kco <- KorAPConnection(accessToken = NULL, cache = TRUE, verbose = FALSE)
expect_warning(
result <- collocationAnalysis(
kco,
"Test",
stopwords = c("der", "die", "und"),
searchHitsSampleLimit = 5,
topCollocatesLimit = 5
),
"access token"
)
expect_true(is.data.frame(result))
})
test_that("collocationAnalysis handles lemmatizeNodeQuery parameter", {
skip_if_offline()
kco <- KorAPConnection(accessToken = NULL, cache = TRUE, verbose = FALSE)
expect_warning(
result <- collocationAnalysis(
kco,
"laufen",
lemmatizeNodeQuery = TRUE,
searchHitsSampleLimit = 5,
topCollocatesLimit = 5
),
"access token"
)
expect_true(is.data.frame(result))
})
test_that("collocationAnalysis handles addExamples parameter", {
skip_if_offline()
kco <- KorAPConnection(accessToken = NULL, cache = TRUE, verbose = FALSE)
expect_warning(
result <- collocationAnalysis(
kco,
"Test",
addExamples = TRUE,
searchHitsSampleLimit = 3,
topCollocatesLimit = 3
),
"access token"
)
expect_true(is.data.frame(result))
if (nrow(result) > 0) {
expect_true("example" %in% colnames(result))
}
})
test_that("collocationAnalysis handles maxRecurse parameter", {
skip_if_offline()
kco <- KorAPConnection(accessToken = NULL, cache = TRUE, verbose = FALSE)
expect_warning(
result <- collocationAnalysis(
kco,
"Test",
maxRecurse = 1,
searchHitsSampleLimit = 2,
topCollocatesLimit = 2
),
"access token"
)
expect_true(is.data.frame(result))
})