Marc Kupietz | a6e4ee6 | 2021-03-05 09:00:15 +0100 | [diff] [blame] | 1 | #' Misc functions |
| 2 | #' |
| 3 | #' @name misc-functions |
| 4 | NULL |
| 5 | #' NULL |
Marc Kupietz | bb7d232 | 2019-10-06 21:42:34 +0200 | [diff] [blame] | 6 | |
| 7 | #' Convert corpus frequency table to instances per million. |
| 8 | #' |
| 9 | #' Convenience function for converting frequency tables to instances per |
| 10 | #' million. |
| 11 | #' |
Marc Kupietz | 67edcb5 | 2021-09-20 21:54:24 +0200 | [diff] [blame^] | 12 | #' Given a table with columns `f`, `conf.low`, and `conf.high`, `ipm` ads a `column ipm` |
| 13 | #' und multiplies conf.low and `conf.high` with 10^6. |
Marc Kupietz | bb7d232 | 2019-10-06 21:42:34 +0200 | [diff] [blame] | 14 | #' |
Marc Kupietz | 67edcb5 | 2021-09-20 21:54:24 +0200 | [diff] [blame^] | 15 | #' @param df table returned from [frequencyQuery()] |
Marc Kupietz | bb7d232 | 2019-10-06 21:42:34 +0200 | [diff] [blame] | 16 | #' |
Marc Kupietz | 67edcb5 | 2021-09-20 21:54:24 +0200 | [diff] [blame^] | 17 | #' @return original table with additional column `ipm` and converted columns `conf.low` and `conf.high` |
Marc Kupietz | bb7d232 | 2019-10-06 21:42:34 +0200 | [diff] [blame] | 18 | #' @export |
| 19 | #' |
Marc Kupietz | a6e4ee6 | 2021-03-05 09:00:15 +0100 | [diff] [blame] | 20 | #' @rdname misc-functions |
Marc Kupietz | bb7d232 | 2019-10-06 21:42:34 +0200 | [diff] [blame] | 21 | #' @importFrom dplyr .data |
| 22 | #' |
| 23 | #' @examples |
Marc Kupietz | 05b2277 | 2020-02-18 21:58:42 +0100 | [diff] [blame] | 24 | #' \donttest{ |
Marc Kupietz | bb7d232 | 2019-10-06 21:42:34 +0200 | [diff] [blame] | 25 | #' new("KorAPConnection") %>% frequencyQuery("Test", paste0("pubDate in ", 2000:2002)) %>% ipm() |
Marc Kupietz | 05b2277 | 2020-02-18 21:58:42 +0100 | [diff] [blame] | 26 | #' } |
Marc Kupietz | bb7d232 | 2019-10-06 21:42:34 +0200 | [diff] [blame] | 27 | ipm <- function(df) { |
| 28 | df %>% |
| 29 | mutate(ipm = .data$f * 10^6, conf.low = .data$conf.low * 10^6, conf.high = .data$conf.high * 10^6) |
| 30 | } |
| 31 | |
Marc Kupietz | 23daf5b | 2019-11-27 10:28:07 +0100 | [diff] [blame] | 32 | #' Convert corpus frequency table of alternatives to percent |
| 33 | #' |
| 34 | #' Convenience function for converting frequency tables of alternative variants |
Marc Kupietz | 67edcb5 | 2021-09-20 21:54:24 +0200 | [diff] [blame^] | 35 | #' (generated with `as.alternatives=TRUE`) to percent. |
Marc Kupietz | 23daf5b | 2019-11-27 10:28:07 +0100 | [diff] [blame] | 36 | #' |
Marc Kupietz | 67edcb5 | 2021-09-20 21:54:24 +0200 | [diff] [blame^] | 37 | #' @param df table returned from [frequencyQuery()] |
Marc Kupietz | 23daf5b | 2019-11-27 10:28:07 +0100 | [diff] [blame] | 38 | #' |
Marc Kupietz | 67edcb5 | 2021-09-20 21:54:24 +0200 | [diff] [blame^] | 39 | #' @return original table with converted columns `f`, `conf.low` and `conf.high` |
Marc Kupietz | 23daf5b | 2019-11-27 10:28:07 +0100 | [diff] [blame] | 40 | #' @export |
| 41 | #' |
| 42 | #' @importFrom dplyr .data |
| 43 | #' |
Marc Kupietz | a6e4ee6 | 2021-03-05 09:00:15 +0100 | [diff] [blame] | 44 | #' @rdname misc-functions |
Marc Kupietz | 23daf5b | 2019-11-27 10:28:07 +0100 | [diff] [blame] | 45 | #' @examples |
Marc Kupietz | 05b2277 | 2020-02-18 21:58:42 +0100 | [diff] [blame] | 46 | #' \donttest{ |
Marc Kupietz | 23daf5b | 2019-11-27 10:28:07 +0100 | [diff] [blame] | 47 | #' new("KorAPConnection") %>% |
| 48 | #' frequencyQuery(c("Tollpatsch", "Tolpatsch"), |
| 49 | #' vc=paste0("pubDate in ", 2000:2002), |
| 50 | #' as.alternatives = TRUE) %>% |
| 51 | #' percent() |
Marc Kupietz | 05b2277 | 2020-02-18 21:58:42 +0100 | [diff] [blame] | 52 | #' } |
Marc Kupietz | 23daf5b | 2019-11-27 10:28:07 +0100 | [diff] [blame] | 53 | percent <- function(df) { |
| 54 | df %>% |
| 55 | mutate(f = .data$f * 10^2, conf.low = .data$conf.low * 10^2, conf.high = .data$conf.high * 10^2) |
| 56 | } |
| 57 | |
Marc Kupietz | 95240e9 | 2019-11-27 18:19:04 +0100 | [diff] [blame] | 58 | #' Convert query or vc strings to plot labels |
| 59 | #' |
| 60 | #' Converts a vector of query or vc strings to typically appropriate legend labels |
| 61 | #' by clipping off prefixes and suffixes that are common to all query strings. |
| 62 | #' |
| 63 | #' @param data string or vector of query or vc definition strings |
Marc Kupietz | 62d29a1 | 2020-01-18 12:38:36 +0100 | [diff] [blame] | 64 | #' @param pubDateOnly discard all but the publication date |
| 65 | #' @param excludePubDate discard publication date constraints |
Marc Kupietz | 95240e9 | 2019-11-27 18:19:04 +0100 | [diff] [blame] | 66 | #' @return string or vector of strings with clipped off common prefixes and suffixes |
| 67 | #' |
Marc Kupietz | a6e4ee6 | 2021-03-05 09:00:15 +0100 | [diff] [blame] | 68 | #' @rdname misc-functions |
| 69 | #' |
Marc Kupietz | 95240e9 | 2019-11-27 18:19:04 +0100 | [diff] [blame] | 70 | #' @examples |
| 71 | #' queryStringToLabel(paste("textType = /Zeit.*/ & pubDate in", c(2010:2019))) |
| 72 | #' queryStringToLabel(c("[marmot/m=mood:subj]", "[marmot/m=mood:ind]")) |
| 73 | #' queryStringToLabel(c("wegen dem [tt/p=NN]", "wegen des [tt/p=NN]")) |
| 74 | #' |
| 75 | #' @importFrom PTXQC lcpCount |
| 76 | #' @importFrom PTXQC lcsCount |
| 77 | #' |
| 78 | #' @export |
Marc Kupietz | cf1771d | 2020-03-04 16:03:04 +0100 | [diff] [blame] | 79 | queryStringToLabel <- function(data, pubDateOnly = FALSE, excludePubDate = FALSE) { |
Marc Kupietz | 62d29a1 | 2020-01-18 12:38:36 +0100 | [diff] [blame] | 80 | if (pubDateOnly) { |
Marc Kupietz | 7aa4f19 | 2021-03-05 10:51:24 +0100 | [diff] [blame] | 81 | data <-substring(data, regexpr("(pub|creation)Date", data)+7) |
Marc Kupietz | 62d29a1 | 2020-01-18 12:38:36 +0100 | [diff] [blame] | 82 | } else if(excludePubDate) { |
Marc Kupietz | 7aa4f19 | 2021-03-05 10:51:24 +0100 | [diff] [blame] | 83 | data <-substring(data, 1, regexpr("(pub|creation)Date", data)) |
Marc Kupietz | 62d29a1 | 2020-01-18 12:38:36 +0100 | [diff] [blame] | 84 | } |
Marc Kupietz | 95240e9 | 2019-11-27 18:19:04 +0100 | [diff] [blame] | 85 | leftCommon = lcpCount(data) |
Marc Kupietz | 62d29a1 | 2020-01-18 12:38:36 +0100 | [diff] [blame] | 86 | while (leftCommon > 0 && grepl("[[:alnum:]/=.*!]", substring(data[1], leftCommon, leftCommon))) { |
Marc Kupietz | 95240e9 | 2019-11-27 18:19:04 +0100 | [diff] [blame] | 87 | leftCommon <- leftCommon - 1 |
| 88 | } |
| 89 | rightCommon = lcsCount(data) |
Marc Kupietz | 62d29a1 | 2020-01-18 12:38:36 +0100 | [diff] [blame] | 90 | while (rightCommon > 0 && grepl("[[:alnum:]/=.*!]", substring(data[1], 1+nchar(data[1]) - rightCommon, 1+nchar(data[1]) - rightCommon))) { |
Marc Kupietz | 95240e9 | 2019-11-27 18:19:04 +0100 | [diff] [blame] | 91 | rightCommon <- rightCommon - 1 |
| 92 | } |
| 93 | substring(data, leftCommon + 1, nchar(data) - rightCommon) |
| 94 | } |
| 95 | |
Marc Kupietz | bb7d232 | 2019-10-06 21:42:34 +0200 | [diff] [blame] | 96 | |
Marc Kupietz | 865760f | 2019-10-07 19:29:44 +0200 | [diff] [blame] | 97 | ## Mute notes: "Undefined global functions or variables:" |
| 98 | globalVariables(c("conf.high", "conf.low", "onRender", "webUIRequestUrl")) |
| 99 | |
| 100 | |
| 101 | #' Experimental: Plot frequency by year graphs with confidence intervals |
Marc Kupietz | d68f971 | 2019-10-06 21:48:00 +0200 | [diff] [blame] | 102 | #' |
Marc Kupietz | 865760f | 2019-10-07 19:29:44 +0200 | [diff] [blame] | 103 | #' Experimental convenience function for plotting typical frequency by year graphs with confidence intervals using ggplot2. |
Marc Kupietz | 67edcb5 | 2021-09-20 21:54:24 +0200 | [diff] [blame^] | 104 | #' **Warning:** This function may be moved to a new package. |
Marc Kupietz | 865760f | 2019-10-07 19:29:44 +0200 | [diff] [blame] | 105 | #' |
| 106 | #' @param mapping Set of aesthetic mappings created by aes() or aes_(). If specified and inherit.aes = TRUE (the default), it is combined with the default mapping at the top level of the plot. You must supply mapping if there is no plot mapping. |
| 107 | #' @param ... Other arguments passed to geom_ribbon, geom_line, and geom_click_point. |
Marc Kupietz | d68f971 | 2019-10-06 21:48:00 +0200 | [diff] [blame] | 108 | #' |
Marc Kupietz | a6e4ee6 | 2021-03-05 09:00:15 +0100 | [diff] [blame] | 109 | #' @rdname misc-functions |
| 110 | #' |
Marc Kupietz | d68f971 | 2019-10-06 21:48:00 +0200 | [diff] [blame] | 111 | #' @examples |
| 112 | #' library(ggplot2) |
| 113 | #' kco <- new("KorAPConnection", verbose=TRUE) |
Marc Kupietz | 05b2277 | 2020-02-18 21:58:42 +0100 | [diff] [blame] | 114 | #' \donttest{ |
Marc Kupietz | d68f971 | 2019-10-06 21:48:00 +0200 | [diff] [blame] | 115 | #' expand_grid(condition = c("textDomain = /Wirtschaft.*/", "textDomain != /Wirtschaft.*/"), |
Marc Kupietz | 2fbac3d | 2020-01-18 11:01:21 +0100 | [diff] [blame] | 116 | #' year = (2005:2011)) %>% |
Marc Kupietz | d68f971 | 2019-10-06 21:48:00 +0200 | [diff] [blame] | 117 | #' cbind(frequencyQuery(kco, "[tt/l=Heuschrecke]", |
| 118 | #' paste0(.$condition," & pubDate in ", .$year))) %>% |
| 119 | #' ipm() %>% |
Marc Kupietz | 865760f | 2019-10-07 19:29:44 +0200 | [diff] [blame] | 120 | #' ggplot(aes(year, ipm, fill = condition, color = condition)) + |
Marc Kupietz | d68f971 | 2019-10-06 21:48:00 +0200 | [diff] [blame] | 121 | #' geom_freq_by_year_ci() |
Marc Kupietz | 05b2277 | 2020-02-18 21:58:42 +0100 | [diff] [blame] | 122 | #' } |
Marc Kupietz | 865760f | 2019-10-07 19:29:44 +0200 | [diff] [blame] | 123 | #' @importFrom ggplot2 ggplot aes geom_ribbon geom_line geom_point theme element_text scale_x_continuous |
Marc Kupietz | d68f971 | 2019-10-06 21:48:00 +0200 | [diff] [blame] | 124 | #' |
| 125 | #' @export |
Marc Kupietz | 865760f | 2019-10-07 19:29:44 +0200 | [diff] [blame] | 126 | geom_freq_by_year_ci <- function(mapping = aes(ymin=conf.low, ymax=conf.high), ...) { |
Marc Kupietz | d68f971 | 2019-10-06 21:48:00 +0200 | [diff] [blame] | 127 | list( |
Marc Kupietz | 865760f | 2019-10-07 19:29:44 +0200 | [diff] [blame] | 128 | geom_ribbon(mapping, |
| 129 | alpha = .3, linetype = 0, show.legend = FALSE, ...), |
| 130 | geom_line(...), |
| 131 | geom_click_point(aes(url=webUIRequestUrl), ...), |
| 132 | theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = c(0.8, 0.2)), |
Marc Kupietz | d68f971 | 2019-10-06 21:48:00 +0200 | [diff] [blame] | 133 | scale_x_continuous(breaks = function(x) seq(ceiling(x[1]), floor(x[2]), by = 1 + floor(((x[2]-x[1])/30))))) |
| 134 | } |
| 135 | |
Marc Kupietz | 5fb892e | 2021-03-05 08:18:25 +0100 | [diff] [blame] | 136 | #' |
Marc Kupietz | 865760f | 2019-10-07 19:29:44 +0200 | [diff] [blame] | 137 | #' @importFrom ggplot2 ggproto aes GeomPoint |
Marc Kupietz | 5fb892e | 2021-03-05 08:18:25 +0100 | [diff] [blame] | 138 | #' |
| 139 | GeomClickPoint <- ggplot2::ggproto( |
Marc Kupietz | 865760f | 2019-10-07 19:29:44 +0200 | [diff] [blame] | 140 | "GeomPoint", |
Marc Kupietz | 5fb892e | 2021-03-05 08:18:25 +0100 | [diff] [blame] | 141 | ggplot2::GeomPoint, |
Marc Kupietz | 865760f | 2019-10-07 19:29:44 +0200 | [diff] [blame] | 142 | required_aes = c("x", "y"), |
| 143 | default_aes = aes( |
| 144 | shape = 19, colour = "black", size = 1.5, fill = NA, |
| 145 | alpha = NA, stroke = 0.5, url = NA |
| 146 | ), |
| 147 | extra_params = c("na.rm", "url"), |
| 148 | draw_panel = function(data, panel_params, |
| 149 | coord, na.rm = FALSE, showpoints = TRUE, url = NULL) { |
| 150 | GeomPoint$draw_panel(data, panel_params, coord, na.rm = na.rm) |
| 151 | } |
| 152 | ) |
| 153 | |
| 154 | #' @importFrom ggplot2 layer |
| 155 | geom_click_point <- function(mapping = NULL, data = NULL, stat = "identity", |
Marc Kupietz | 5fb892e | 2021-03-05 08:18:25 +0100 | [diff] [blame] | 156 | position = "identity", na.rm = FALSE, show.legend = NA, |
| 157 | inherit.aes = TRUE, url = NA, ...) { |
Marc Kupietz | 865760f | 2019-10-07 19:29:44 +0200 | [diff] [blame] | 158 | layer( |
| 159 | geom = GeomClickPoint, mapping = mapping, data = data, stat = stat, |
| 160 | position = position, show.legend = show.legend, inherit.aes = inherit.aes, |
| 161 | params = list(na.rm = na.rm, ...) |
| 162 | ) |
| 163 | } |
| 164 | |