blob: 032b6779a3dc3719e7338597c69971dbacefc153 [file] [log] [blame]
Marc Kupietzdcc1de62019-10-04 09:10:36 +02001#' Add confidence interval and relative frequency variables
2#'
Marc Kupietz67edcb52021-09-20 21:54:24 +02003#' Using [prop.test()], `ci` adds three columns to a data frame:
4#' 1. relative frequency (`f`)
5#' 2. lower bound of a confidence interval (`ci.low`)
Marc Kupietz3f575282019-10-04 14:46:04 +02006#' 3. upper bound of a confidence interval
7#'
Marc Kupietz97a1bca2019-10-04 22:52:09 +02008#'
Marc Kupietz3f575282019-10-04 14:46:04 +02009#' @seealso
Marc Kupietz67edcb52021-09-20 21:54:24 +020010#' `ci` is already included in [frequencyQuery()]
Marc Kupietzdcc1de62019-10-04 09:10:36 +020011#'
12#' @param df table with columns for absolute and total frequencies.
13#' @param x column with the observed absolute frequency.
Marc Kupietz43a6ade2020-02-18 17:01:44 +010014#' @param N column with the total frequencies
Marc Kupietzdcc1de62019-10-04 09:10:36 +020015#' @param conf.level confidence level of the returned confidence interval. Must
16#' be a single number between 0 and 1.
17#'
Marc Kupietza6e4ee62021-03-05 09:00:15 +010018#' @rdname misc-functions
19#'
Marc Kupietzdcc1de62019-10-04 09:10:36 +020020#' @export
21#' @importFrom stats prop.test
Marc Kupietz3f575282019-10-04 14:46:04 +020022#' @importFrom tibble remove_rownames
Marc Kupietz319e7462025-06-04 17:14:03 +020023#' @importFrom dplyr enquo rename starts_with filter mutate rowwise bind_rows select arrange row_number quo_name
24#' @importFrom broom tidy
25#' @importFrom tidyr unnest
Marc Kupietzdcc1de62019-10-04 09:10:36 +020026#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +020027#' \dontrun{
28#'
Marc Kupietzdcc1de62019-10-04 09:10:36 +020029#' library(ggplot2)
Marc Kupietz617266d2025-02-27 10:43:07 +010030#' kco <- KorAPConnection(verbose=TRUE)
Marc Kupietzdcc1de62019-10-04 09:10:36 +020031#' expand_grid(year=2015:2018, alternatives=c("Hate Speech", "Hatespeech")) %>%
32#' bind_cols(corpusQuery(kco, .$alternatives, sprintf("pubDate in %d", .$year))) %>%
Marc Kupietz71d6e052019-11-22 18:42:10 +010033#' mutate(total=corpusStats(kco, vc=vc)$tokens) %>%
Marc Kupietzdcc1de62019-10-04 09:10:36 +020034#' ci() %>%
35#' ggplot(aes(x=year, y=f, fill=query, color=query, ymin=conf.low, ymax=conf.high)) +
36#' geom_point() + geom_line() + geom_ribbon(alpha=.3)
Marc Kupietz05b22772020-02-18 21:58:42 +010037#' }
Marc Kupietz53c1b502020-02-03 22:48:30 +010038ci <- function(df,
39 x = totalResults,
40 N = total,
41 conf.level = 0.95) {
Marc Kupietzdcc1de62019-10-04 09:10:36 +020042 x <- enquo(x)
43 N <- enquo(N)
Marc Kupietz319e7462025-06-04 17:14:03 +020044
45 # Add row index to preserve original order
46 df <- df %>% mutate(.row_index = row_number())
47
48 # Initialize result with all NA values
49 result <- df %>%
50 mutate(f = NA_real_, conf.low = NA_real_, conf.high = NA_real_)
51
52 # Calculate confidence intervals for valid rows
53 # Use the column names from the enquoted expressions
54 N_col <- quo_name(N)
55 x_col <- quo_name(x)
56 valid_indices <- which(df[[N_col]] > 0 & !is.na(df[[N_col]]) & !is.na(df[[x_col]]))
57
58 if (length(valid_indices) > 0) {
59 valid_data <- df[valid_indices, ]
60
61 ci_results <- valid_data %>%
62 rowwise %>%
63 mutate(tst = list(
64 broom::tidy(prop.test(!!x, !!N, conf.level = conf.level)) %>%
65 select(estimate, conf.low, conf.high) %>%
66 rename(f = estimate)
67 )) %>%
68 tidyr::unnest(tst) %>%
69 select(.row_index, f, conf.low, conf.high)
70
71 # Update result with calculated values
72 for (i in seq_len(nrow(ci_results))) {
73 row_idx <- ci_results$.row_index[i]
74 result$f[row_idx] <- ci_results$f[i]
75 result$conf.low[row_idx] <- ci_results$conf.low[i]
76 result$conf.high[row_idx] <- ci_results$conf.high[i]
77 }
78 }
79
80 # Remove the helper column
81 result %>% select(-.row_index)
Marc Kupietzdcc1de62019-10-04 09:10:36 +020082}
83
Marc Kupietz7d613872019-10-04 22:47:20 +020084## Mute notes: "Undefined global functions or variables:"
Marc Kupietz319e7462025-06-04 17:14:03 +020085globalVariables(c("totalResults", "total", "estimate", "tst", ".row_index", "f", "conf.low", "conf.high", "N_col", "x_col"))
Marc Kupietzdcc1de62019-10-04 09:10:36 +020086
87
Marc Kupietz71d6e052019-11-22 18:42:10 +010088# ci.old <- function(df, x = totalResults, N = total, conf.level = 0.95) {
Marc Kupietzdcc1de62019-10-04 09:10:36 +020089# x <- deparse(substitute(x))
90# N <- deparse(substitute(N))
91# df <- data.frame(df)
92# df$f <- df[,x] / df[,N]
93# df[, c("conf.low", "conf.high")] <- t(sapply(Map(function(a, b) prop.test(a, b, conf.level = conf.level), df[,x], df[,N]), "[[","conf.int"))
94# return(df)
95# }