Keep row order constant in ci function Change-Id: I46f6509a1f4b3defb62ef8c42653f6e1f76e10be

commit: 319e74659c66d809cf6081c549a8290b6562c8f4 [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Wed Jun 04 17:14:03 2025 +0200
committer: Marc Kupietz <kupietz@ids-mannheim.de> Wed Jun 04 17:14:03 2025 +0200
tree: cddd3cfed1784f1ec500d207e4130cf937508670
parent: 3b3e7cd32955a7bfd0dae03a340620d0b916309e [diff]
diff --git a/tests/testthat/test-ci.R b/tests/testthat/test-ci.R
new file mode 100644
index 0000000..c980470
--- /dev/null
+++ b/tests/testthat/test-ci.R

@@ -0,0 +1,303 @@
+test_that("ci function works with basic input", {
+  # Create a simple test data frame
+  df <- data.frame(
+    totalResults = c(100, 200, 50),
+    total = c(1000, 2000, 500),
+    query = c("test1", "test2", "test3")
+  )
+
+  result <- ci(df)
+
+  expect_s3_class(result, "data.frame")
+  expect_true("f" %in% names(result))
+  expect_true("conf.low" %in% names(result))
+  expect_true("conf.high" %in% names(result))
+  expect_equal(nrow(result), 3)
+
+  # Check that relative frequencies are calculated correctly
+  expect_equal(result$f[1], 0.1, tolerance = 0.001)
+  expect_equal(result$f[2], 0.1, tolerance = 0.001)
+  expect_equal(result$f[3], 0.1, tolerance = 0.001)
+})
+
+test_that("ci function handles custom column names", {
+  # Test with custom column names
+  df <- data.frame(
+    observed = c(50, 100),
+    N_total = c(500, 1000),
+    condition = c("A", "B")
+  )
+
+  result <- ci(df, x = observed, N = N_total)
+
+  expect_s3_class(result, "data.frame")
+  expect_true("f" %in% names(result))
+  expect_true("conf.low" %in% names(result))
+  expect_true("conf.high" %in% names(result))
+  expect_equal(nrow(result), 2)
+  expect_equal(result$f[1], 0.1, tolerance = 0.001)
+  expect_equal(result$f[2], 0.1, tolerance = 0.001)
+})
+
+test_that("ci function handles different confidence levels", {
+  df <- data.frame(
+    totalResults = c(100),
+    total = c(1000)
+  )
+
+  # Test 90% confidence level
+  result_90 <- ci(df, conf.level = 0.90)
+  expect_s3_class(result_90, "data.frame")
+  expect_true("f" %in% names(result_90))
+  expect_true("conf.low" %in% names(result_90))
+  expect_true("conf.high" %in% names(result_90))
+
+  # Test 99% confidence level
+  result_99 <- ci(df, conf.level = 0.99)
+  expect_s3_class(result_99, "data.frame")
+
+  # 99% CI should be wider than 90% CI
+  ci_width_90 <- result_90$conf.high[1] - result_90$conf.low[1]
+  ci_width_99 <- result_99$conf.high[1] - result_99$conf.low[1]
+  expect_true(ci_width_99 > ci_width_90)
+})
+
+test_that("ci function handles zero and negative totals", {
+  df <- data.frame(
+    totalResults = c(10, 20, 30),
+    total = c(100, 0, -10)
+  )
+
+  result <- ci(df)
+
+  expect_s3_class(result, "data.frame")
+  expect_equal(nrow(result), 3)
+
+  # First row should have valid values
+  expect_false(is.na(result$f[1]))
+  expect_false(is.na(result$conf.low[1]))
+  expect_false(is.na(result$conf.high[1]))
+
+  # Rows with zero or negative totals should have NA values
+  expect_true(is.na(result$f[2]))
+  expect_true(is.na(result$conf.low[2]))
+  expect_true(is.na(result$conf.high[2]))
+  expect_true(is.na(result$f[3]))
+  expect_true(is.na(result$conf.low[3]))
+  expect_true(is.na(result$conf.high[3]))
+})
+
+test_that("ci function handles NA values in totals", {
+  df <- data.frame(
+    totalResults = c(10, 20, 30),
+    total = c(100, NA, 300)
+  )
+
+  result <- ci(df)
+
+  expect_s3_class(result, "data.frame")
+  expect_equal(nrow(result), 3)
+
+  # First and third rows should have valid values
+  expect_false(is.na(result$f[1]))
+  expect_false(is.na(result$f[3]))
+
+  # Second row (with NA total) should have NA values
+  expect_true(is.na(result$f[2]))
+  expect_true(is.na(result$conf.low[2]))
+  expect_true(is.na(result$conf.high[2]))
+})
+
+test_that("ci function handles edge cases with very small frequencies", {
+  df <- data.frame(
+    totalResults = c(1, 0),
+    total = c(1000000, 1000000)
+  )
+
+  result <- ci(df)
+
+  expect_s3_class(result, "data.frame")
+  expect_equal(nrow(result), 2)
+
+  # Check that very small frequencies are handled correctly
+  expect_true(result$f[1] > 0)
+  expect_true(result$f[1] < 0.01)
+  expect_equal(result$f[2], 0)
+})
+
+test_that("ci function handles large numbers correctly", {
+  df <- data.frame(
+    totalResults = c(1000000),
+    total = c(10000000)
+  )
+
+  result <- ci(df)
+
+  expect_s3_class(result, "data.frame")
+  expect_equal(nrow(result), 1)
+  expect_equal(result$f[1], 0.1, tolerance = 0.001)
+  expect_true(result$conf.low[1] > 0)
+  expect_true(result$conf.high[1] < 1)
+})
+
+test_that("ci function preserves original columns", {
+  df <- data.frame(
+    totalResults = c(100, 200),
+    total = c(1000, 2000),
+    query = c("test1", "test2"),
+    condition = c("A", "B"),
+    year = c(2020, 2021)
+  )
+
+  result <- ci(df)
+
+  expect_s3_class(result, "data.frame")
+  expect_true("query" %in% names(result))
+  expect_true("condition" %in% names(result))
+  expect_true("year" %in% names(result))
+  expect_true("totalResults" %in% names(result))
+  expect_true("total" %in% names(result))
+
+  # Check that original values are preserved
+  expect_equal(result$query, c("test1", "test2"))
+  expect_equal(result$condition, c("A", "B"))
+  expect_equal(result$year, c(2020, 2021))
+})
+
+test_that("ci function handles empty data frame", {
+  df <- data.frame(
+    totalResults = numeric(0),
+    total = numeric(0)
+  )
+
+  result <- ci(df)
+
+  expect_s3_class(result, "data.frame")
+  expect_equal(nrow(result), 0)
+  expect_true("f" %in% names(result))
+  expect_true("conf.low" %in% names(result))
+  expect_true("conf.high" %in% names(result))
+})
+
+test_that("ci function handles all zero totals", {
+  df <- data.frame(
+    totalResults = c(10, 20, 30),
+    total = c(0, 0, 0)
+  )
+
+  result <- ci(df)
+
+  expect_s3_class(result, "data.frame")
+  expect_equal(nrow(result), 3)
+
+  # All rows should have NA values
+  expect_true(all(is.na(result$f)))
+  expect_true(all(is.na(result$conf.low)))
+  expect_true(all(is.na(result$conf.high)))
+})
+
+test_that("ci function validates confidence level parameter", {
+  df <- data.frame(
+    totalResults = c(100),
+    total = c(1000)
+  )
+
+  # Test invalid confidence levels
+  expect_error(ci(df, conf.level = 1.1))
+  expect_error(ci(df, conf.level = 0))
+  expect_error(ci(df, conf.level = -0.1))
+})
+
+test_that("ci function handles tibble input", {
+  if (requireNamespace("tibble", quietly = TRUE)) {
+    df <- tibble::tibble(
+      totalResults = c(100, 200),
+      total = c(1000, 2000),
+      query = c("test1", "test2")
+    )
+
+    result <- ci(df)
+
+    expect_s3_class(result, "tbl_df")
+    expect_true("f" %in% names(result))
+    expect_true("conf.low" %in% names(result))
+    expect_true("conf.high" %in% names(result))
+    expect_equal(nrow(result), 2)
+  }
+})
+
+test_that("ci function confidence intervals are reasonable", {
+  # Test with a known case
+  df <- data.frame(
+    totalResults = c(50),  # 50 out of 100 = 50%
+    total = c(100)
+  )
+
+  result <- ci(df, conf.level = 0.95)
+
+  expect_s3_class(result, "data.frame")
+  expect_equal(result$f[1], 0.5, tolerance = 0.001)
+
+  # For 50% with n=100, 95% CI should be roughly symmetric around 0.5
+  expect_true(result$conf.low[1] < 0.5)
+  expect_true(result$conf.high[1] > 0.5)
+
+  # CI should be reasonable width (not too narrow or too wide)
+  ci_width <- result$conf.high[1] - result$conf.low[1]
+  expect_true(ci_width > 0.05)  # Not too narrow
+  expect_true(ci_width < 0.5)   # Not too wide
+})
+
+test_that("ci function works with mixed valid and invalid data", {
+  df <- data.frame(
+    totalResults = c(100, 200, 50, 75),
+    total = c(1000, 0, NA, 500),
+    condition = c("A", "B", "C", "D")
+  )
+
+  result <- ci(df)
+
+  expect_s3_class(result, "data.frame")
+  expect_equal(nrow(result), 4)
+
+  # First and fourth rows should have valid values
+  expect_false(is.na(result$f[1]))
+  expect_false(is.na(result$f[4]))
+
+  # Second and third rows should have NA values
+  expect_true(is.na(result$f[2]))
+  expect_true(is.na(result$f[3]))
+
+  # Check that valid calculations are correct
+  expect_equal(result$f[1], 0.1, tolerance = 0.001)
+  expect_equal(result$f[4], 0.15, tolerance = 0.001)
+})
+
+test_that("ci function preserves row order with mixed valid/invalid data", {
+  # Test data with alternating valid and invalid rows
+  df <- data.frame(
+    totalResults = c(100, 0, 200, NA, 50),
+    total = c(1000, 0, 2000, 1500, 500),
+    query = c("first", "second", "third", "fourth", "fifth"),
+    stringsAsFactors = FALSE
+  )
+
+  result <- ci(df)
+
+  # Check that the order is preserved
+  expect_equal(result$query, c("first", "second", "third", "fourth", "fifth"))
+
+  # Check that valid rows have computed values
+  expect_false(is.na(result$f[1]))  # first row should have valid f
+  expect_false(is.na(result$f[3]))  # third row should have valid f
+  expect_false(is.na(result$f[5]))  # fifth row should have valid f
+
+  # Check that invalid rows have NA values
+  expect_true(is.na(result$f[2]))   # second row (total = 0)
+  expect_true(is.na(result$f[4]))   # fourth row (total = NA)
+
+  expect_true(is.na(result$conf.low[2]))
+  expect_true(is.na(result$conf.high[2]))
+  expect_true(is.na(result$conf.low[4]))
+  expect_true(is.na(result$conf.high[4]))
+})
commit	319e74659c66d809cf6081c549a8290b6562c8f4	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Wed Jun 04 17:14:03 2025 +0200
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Wed Jun 04 17:14:03 2025 +0200
tree	cddd3cfed1784f1ec500d207e4130cf937508670
parent	3b3e7cd32955a7bfd0dae03a340620d0b916309e [diff]