blob: 032b6779a3dc3719e7338597c69971dbacefc153 [file] [log] [blame]
#' Add confidence interval and relative frequency variables
#'
#' Using [prop.test()], `ci` adds three columns to a data frame:
#' 1. relative frequency (`f`)
#' 2. lower bound of a confidence interval (`ci.low`)
#' 3. upper bound of a confidence interval
#'
#'
#' @seealso
#' `ci` is already included in [frequencyQuery()]
#'
#' @param df table with columns for absolute and total frequencies.
#' @param x column with the observed absolute frequency.
#' @param N column with the total frequencies
#' @param conf.level confidence level of the returned confidence interval. Must
#' be a single number between 0 and 1.
#'
#' @rdname misc-functions
#'
#' @export
#' @importFrom stats prop.test
#' @importFrom tibble remove_rownames
#' @importFrom dplyr enquo rename starts_with filter mutate rowwise bind_rows select arrange row_number quo_name
#' @importFrom broom tidy
#' @importFrom tidyr unnest
#' @examples
#' \dontrun{
#'
#' library(ggplot2)
#' kco <- KorAPConnection(verbose=TRUE)
#' expand_grid(year=2015:2018, alternatives=c("Hate Speech", "Hatespeech")) %>%
#' bind_cols(corpusQuery(kco, .$alternatives, sprintf("pubDate in %d", .$year))) %>%
#' mutate(total=corpusStats(kco, vc=vc)$tokens) %>%
#' ci() %>%
#' ggplot(aes(x=year, y=f, fill=query, color=query, ymin=conf.low, ymax=conf.high)) +
#' geom_point() + geom_line() + geom_ribbon(alpha=.3)
#' }
ci <- function(df,
x = totalResults,
N = total,
conf.level = 0.95) {
x <- enquo(x)
N <- enquo(N)
# Add row index to preserve original order
df <- df %>% mutate(.row_index = row_number())
# Initialize result with all NA values
result <- df %>%
mutate(f = NA_real_, conf.low = NA_real_, conf.high = NA_real_)
# Calculate confidence intervals for valid rows
# Use the column names from the enquoted expressions
N_col <- quo_name(N)
x_col <- quo_name(x)
valid_indices <- which(df[[N_col]] > 0 & !is.na(df[[N_col]]) & !is.na(df[[x_col]]))
if (length(valid_indices) > 0) {
valid_data <- df[valid_indices, ]
ci_results <- valid_data %>%
rowwise %>%
mutate(tst = list(
broom::tidy(prop.test(!!x, !!N, conf.level = conf.level)) %>%
select(estimate, conf.low, conf.high) %>%
rename(f = estimate)
)) %>%
tidyr::unnest(tst) %>%
select(.row_index, f, conf.low, conf.high)
# Update result with calculated values
for (i in seq_len(nrow(ci_results))) {
row_idx <- ci_results$.row_index[i]
result$f[row_idx] <- ci_results$f[i]
result$conf.low[row_idx] <- ci_results$conf.low[i]
result$conf.high[row_idx] <- ci_results$conf.high[i]
}
}
# Remove the helper column
result %>% select(-.row_index)
}
## Mute notes: "Undefined global functions or variables:"
globalVariables(c("totalResults", "total", "estimate", "tst", ".row_index", "f", "conf.low", "conf.high", "N_col", "x_col"))
# ci.old <- function(df, x = totalResults, N = total, conf.level = 0.95) {
# x <- deparse(substitute(x))
# N <- deparse(substitute(N))
# df <- data.frame(df)
# df$f <- df[,x] / df[,N]
# df[, c("conf.low", "conf.high")] <- t(sapply(Map(function(a, b) prop.test(a, b, conf.level = conf.level), df[,x], df[,N]), "[[","conf.int"))
# return(df)
# }