Marc Kupietz | dcc1de6 | 2019-10-04 09:10:36 +0200 | [diff] [blame] | 1 | #' Add confidence interval and relative frequency variables |
| 2 | #' |
Marc Kupietz | 67edcb5 | 2021-09-20 21:54:24 +0200 | [diff] [blame] | 3 | #' Using [prop.test()], `ci` adds three columns to a data frame: |
| 4 | #' 1. relative frequency (`f`) |
| 5 | #' 2. lower bound of a confidence interval (`ci.low`) |
Marc Kupietz | 3f57528 | 2019-10-04 14:46:04 +0200 | [diff] [blame] | 6 | #' 3. upper bound of a confidence interval |
| 7 | #' |
Marc Kupietz | 97a1bca | 2019-10-04 22:52:09 +0200 | [diff] [blame] | 8 | #' |
Marc Kupietz | 3f57528 | 2019-10-04 14:46:04 +0200 | [diff] [blame] | 9 | #' @seealso |
Marc Kupietz | 67edcb5 | 2021-09-20 21:54:24 +0200 | [diff] [blame] | 10 | #' `ci` is already included in [frequencyQuery()] |
Marc Kupietz | dcc1de6 | 2019-10-04 09:10:36 +0200 | [diff] [blame] | 11 | #' |
| 12 | #' @param df table with columns for absolute and total frequencies. |
| 13 | #' @param x column with the observed absolute frequency. |
Marc Kupietz | 43a6ade | 2020-02-18 17:01:44 +0100 | [diff] [blame] | 14 | #' @param N column with the total frequencies |
Marc Kupietz | dcc1de6 | 2019-10-04 09:10:36 +0200 | [diff] [blame] | 15 | #' @param conf.level confidence level of the returned confidence interval. Must |
| 16 | #' be a single number between 0 and 1. |
| 17 | #' |
Marc Kupietz | a6e4ee6 | 2021-03-05 09:00:15 +0100 | [diff] [blame] | 18 | #' @rdname misc-functions |
| 19 | #' |
Marc Kupietz | dcc1de6 | 2019-10-04 09:10:36 +0200 | [diff] [blame] | 20 | #' @export |
| 21 | #' @importFrom stats prop.test |
Marc Kupietz | 3f57528 | 2019-10-04 14:46:04 +0200 | [diff] [blame] | 22 | #' @importFrom tibble remove_rownames |
Marc Kupietz | 319e746 | 2025-06-04 17:14:03 +0200 | [diff] [blame] | 23 | #' @importFrom dplyr enquo rename starts_with filter mutate rowwise bind_rows select arrange row_number quo_name |
| 24 | #' @importFrom broom tidy |
| 25 | #' @importFrom tidyr unnest |
Marc Kupietz | dcc1de6 | 2019-10-04 09:10:36 +0200 | [diff] [blame] | 26 | #' @examples |
Marc Kupietz | 6ae7605 | 2021-09-21 10:34:00 +0200 | [diff] [blame] | 27 | #' \dontrun{ |
| 28 | #' |
Marc Kupietz | dcc1de6 | 2019-10-04 09:10:36 +0200 | [diff] [blame] | 29 | #' library(ggplot2) |
Marc Kupietz | 617266d | 2025-02-27 10:43:07 +0100 | [diff] [blame] | 30 | #' kco <- KorAPConnection(verbose=TRUE) |
Marc Kupietz | dcc1de6 | 2019-10-04 09:10:36 +0200 | [diff] [blame] | 31 | #' expand_grid(year=2015:2018, alternatives=c("Hate Speech", "Hatespeech")) %>% |
| 32 | #' bind_cols(corpusQuery(kco, .$alternatives, sprintf("pubDate in %d", .$year))) %>% |
Marc Kupietz | 71d6e05 | 2019-11-22 18:42:10 +0100 | [diff] [blame] | 33 | #' mutate(total=corpusStats(kco, vc=vc)$tokens) %>% |
Marc Kupietz | dcc1de6 | 2019-10-04 09:10:36 +0200 | [diff] [blame] | 34 | #' ci() %>% |
| 35 | #' ggplot(aes(x=year, y=f, fill=query, color=query, ymin=conf.low, ymax=conf.high)) + |
| 36 | #' geom_point() + geom_line() + geom_ribbon(alpha=.3) |
Marc Kupietz | 05b2277 | 2020-02-18 21:58:42 +0100 | [diff] [blame] | 37 | #' } |
Marc Kupietz | 53c1b50 | 2020-02-03 22:48:30 +0100 | [diff] [blame] | 38 | ci <- function(df, |
| 39 | x = totalResults, |
| 40 | N = total, |
| 41 | conf.level = 0.95) { |
Marc Kupietz | dcc1de6 | 2019-10-04 09:10:36 +0200 | [diff] [blame] | 42 | x <- enquo(x) |
| 43 | N <- enquo(N) |
Marc Kupietz | 319e746 | 2025-06-04 17:14:03 +0200 | [diff] [blame] | 44 | |
| 45 | # Add row index to preserve original order |
| 46 | df <- df %>% mutate(.row_index = row_number()) |
| 47 | |
| 48 | # Initialize result with all NA values |
| 49 | result <- df %>% |
| 50 | mutate(f = NA_real_, conf.low = NA_real_, conf.high = NA_real_) |
| 51 | |
| 52 | # Calculate confidence intervals for valid rows |
| 53 | # Use the column names from the enquoted expressions |
| 54 | N_col <- quo_name(N) |
| 55 | x_col <- quo_name(x) |
| 56 | valid_indices <- which(df[[N_col]] > 0 & !is.na(df[[N_col]]) & !is.na(df[[x_col]])) |
| 57 | |
| 58 | if (length(valid_indices) > 0) { |
| 59 | valid_data <- df[valid_indices, ] |
| 60 | |
| 61 | ci_results <- valid_data %>% |
| 62 | rowwise %>% |
| 63 | mutate(tst = list( |
| 64 | broom::tidy(prop.test(!!x, !!N, conf.level = conf.level)) %>% |
| 65 | select(estimate, conf.low, conf.high) %>% |
| 66 | rename(f = estimate) |
| 67 | )) %>% |
| 68 | tidyr::unnest(tst) %>% |
| 69 | select(.row_index, f, conf.low, conf.high) |
| 70 | |
| 71 | # Update result with calculated values |
| 72 | for (i in seq_len(nrow(ci_results))) { |
| 73 | row_idx <- ci_results$.row_index[i] |
| 74 | result$f[row_idx] <- ci_results$f[i] |
| 75 | result$conf.low[row_idx] <- ci_results$conf.low[i] |
| 76 | result$conf.high[row_idx] <- ci_results$conf.high[i] |
| 77 | } |
| 78 | } |
| 79 | |
| 80 | # Remove the helper column |
| 81 | result %>% select(-.row_index) |
Marc Kupietz | dcc1de6 | 2019-10-04 09:10:36 +0200 | [diff] [blame] | 82 | } |
| 83 | |
Marc Kupietz | 7d61387 | 2019-10-04 22:47:20 +0200 | [diff] [blame] | 84 | ## Mute notes: "Undefined global functions or variables:" |
Marc Kupietz | 319e746 | 2025-06-04 17:14:03 +0200 | [diff] [blame] | 85 | globalVariables(c("totalResults", "total", "estimate", "tst", ".row_index", "f", "conf.low", "conf.high", "N_col", "x_col")) |
Marc Kupietz | dcc1de6 | 2019-10-04 09:10:36 +0200 | [diff] [blame] | 86 | |
| 87 | |
Marc Kupietz | 71d6e05 | 2019-11-22 18:42:10 +0100 | [diff] [blame] | 88 | # ci.old <- function(df, x = totalResults, N = total, conf.level = 0.95) { |
Marc Kupietz | dcc1de6 | 2019-10-04 09:10:36 +0200 | [diff] [blame] | 89 | # x <- deparse(substitute(x)) |
| 90 | # N <- deparse(substitute(N)) |
| 91 | # df <- data.frame(df) |
| 92 | # df$f <- df[,x] / df[,N] |
| 93 | # df[, c("conf.low", "conf.high")] <- t(sapply(Map(function(a, b) prop.test(a, b, conf.level = conf.level), df[,x], df[,N]), "[[","conf.int")) |
| 94 | # return(df) |
| 95 | # } |