Marc Kupietz | 827a3c1 | 2019-09-18 22:09:33 +0200 | [diff] [blame] | 1 | #!/usr/bin/env Rscript |
#
# Plot the frequency of alternative expressions or spelling variants over time
#
| 5 | library(RKorAPClient) |
| 6 | library(ggplot2) |
Marc Kupietz | 69cc54a | 2019-09-30 12:06:54 +0200 | [diff] [blame] | 7 | library(tidyr) |
| 8 | library(dplyr) |
Marc Kupietz | 827a3c1 | 2019-09-18 22:09:33 +0200 | [diff] [blame] | 9 | library(plotly) |
| 10 | library(htmlwidgets) |
| 11 | |
# Plot the relative frequency of alternative expressions over time.
#
# Every combination of year and alternative is looked up with corpusQuery()
# inside the virtual corpus "textType = /Zeit.*/ & pubDate in <year>". For
# each year the hit counts of all alternatives are summed, and every
# alternative's share of that sum is plotted as a line with a confidence
# band taken from prop.test(). The plot is converted to an interactive
# plotly widget whose line points open the corresponding KorAP query in a
# browser window named 'korap' when clicked.
#
# Arguments:
#   alternatives  character vector of query strings to compare
#   years         vector of publication years; one data point per year
#   kco           KorAPConnection used for all queries
#
# Returns the data behind the plot, one row per year and variant, with the
# columns year, total, Variant, value, share, url and ci (a two-column
# matrix holding the lower and upper confidence bound). The widget is
# printed as a side effect.
alternativesOverTime <- function(alternatives, years,
                                 kco = new("KorAPConnection", verbose = TRUE)) {
  vc <- "textType = /Zeit.*/ & pubDate in"

  # One row per (year, alternative) pair; years vary slowest, alternatives
  # fastest.
  grid <- expand_grid(year = years, name = alternatives)

  # Query every pair exactly once and read both the hit count and the web UI
  # URL from the same response instead of sending each query twice.
  res <- corpusQuery(kco, query = grid$name, vc = paste(vc, grid$year))

  df <- grid %>%
    mutate(value = res$totalResults, url = res$webUIRequestUrl) %>%
    group_by(year) %>%
    # as.numeric() keeps the total a double, as a rowSums() result would be.
    mutate(total = sum(as.numeric(value))) %>%
    ungroup() %>%
    mutate(share = value / total) %>%
    select(year, total, Variant = name, value, share, url)

  # NOTE(review): prop.test() rejects non-positive trial counts, so a year
  # without hits for any alternative would fail here - confirm whether that
  # case needs handling.
  df$ci <- t(vapply(
    Map(prop.test, df$value, df$total),
    function(test) as.numeric(test$conf.int),
    numeric(2)
  ))

  # Fix the variant order for plotting so that the traces follow the order of
  # `alternatives` rather than the sort order of the query strings; the loop
  # below pairs traces with alternatives by position.
  plot_df <- df
  plot_df$Variant <- factor(plot_df$Variant, levels = alternatives)

  g <- ggplot(
    data = plot_df,
    mapping = aes(x = year, y = share, color = Variant, fill = Variant)
  ) +
    geom_ribbon(
      aes(ymin = ci[, 1], ymax = ci[, 2], color = Variant, fill = Variant),
      alpha = .3, linetype = 0
    ) +
    geom_line() +
    geom_point() +
    ggtitle(paste0(alternatives, collapse = " vs. ")) +
    xlab("TIME") +
    ylab("Observed frequency ratio") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    scale_x_continuous(breaks = unique(df$year))
  pp <- ggplotly(g, tooltip = c("x", "y"))

  # Assumes ggplotly() emits its traces layer by layer with one trace per
  # variant (ribbons, then lines, then points), so the line traces start
  # right after the ribbon traces - TODO confirm against plotly's layout.
  n_variants <- length(alternatives)
  for (i in seq_along(alternatives)) {
    trace <- n_variants + i
    vdata <- df[df$Variant == alternatives[i], ]
    pp$x$data[[trace]]$customdata <- vdata$url
    pp$x$data[[trace]]$text <- sprintf(
      "%s<br />absolute: %d / %d",
      pp$x$data[[trace]]$text, vdata$value, vdata$total
    )
  }

  # Open the query URL stored in customdata when a point is clicked.
  ppp <- onRender(pp, "function(el, x) { el.on('plotly_click', function(d) { var url=d.points[0].customdata; window.open(url, 'korap') })}")
  print(ppp)
  df
}
| 44 | |
# Example run: compare the two-word and the one-word spelling for the
# publication years 1995 to 2018.
df <- alternativesOverTime(
  alternatives = c('so "genannte.?"', '"sogenannte.?"'),
  years = 1995:2018
)