Marc Kupietz | 9d57d4b | 2019-09-25 20:02:37 +0200 | [diff] [blame] | 1 | #!/usr/bin/env Rscript |
| 2 | # |
| 3 | # Plot frequency of an expression under multiple conditions over time |
| 4 | # |
| 5 | #library(devtools) |
| 6 | #install_git("https://korap.ids-mannheim.de/gerrit/KorAP/RKorAPClient", upgrade="never") |
| 7 | library(RKorAPClient) |
| 8 | library(ggplot2) |
| 9 | library(reshape2) |
| 10 | #library(plotly) |
| 11 | |
# Plot the relative frequency of a query under several conditions over time.
#
# For every combination of condition and year, the virtual corpus
# "<condition> & pubDate in <year>" is queried twice: once with corpusQuery()
# to obtain the number of hits (slot totalResults) and once with corpusStats()
# to obtain the size of that virtual corpus (slot tokens). The resulting
# relative frequencies are plotted as points and lines, with a ribbon showing
# the confidence interval computed by prop.test().
#
# Args:
#   query:      query string passed to corpusQuery(); also used in the y-axis
#               label.
#   conditions: character vector of virtual corpus definitions.
#   years:      vector of years; one data point per year and condition.
#   kco:        KorAPConnection object used for all requests.
#
# Returns:
#   A long-format data frame with one row per year and condition and the
#   columns year, condition, afreq (hits), total (tokens), ci (two-column
#   matrix: lower and upper confidence bound) and freq (afreq / total).
#   The plot is printed as a side effect.
conditionsOverTime <- function(query, conditions, years, kco = new("KorAPConnection", verbose = TRUE)) {
  df <- data.frame(year = years)

  # One column of absolute hit counts per condition (wide format).
  for (condition in conditions) {
    df[[condition]] <- vapply(
      df$year,
      function(y) {
        vc <- paste(condition, "& pubDate in", y)
        as.numeric(corpusQuery(kco, query, vc = vc)@totalResults)
      },
      numeric(1)
    )
  }

  df <- melt(df, measure.vars = conditions, value.name = "afreq", variable.name = "condition")

  # Size of each year/condition subcorpus. Building the virtual corpus strings
  # with a vectorised paste() avoids pushing the data frame through a
  # character matrix (as apply() would), which formats the year column.
  vcs <- paste(df$condition, "& pubDate in", df$year)
  df$total <- vapply(
    vcs,
    function(vc) as.numeric(corpusStats(kco, vc = vc)@tokens),
    numeric(1),
    USE.NAMES = FALSE
  )

  # prop.test() rejects n <= 0, so an empty subcorpus would abort the whole
  # run; such rows get NA bounds instead.
  confidenceBounds <- function(hits, size) {
    if (is.na(hits) || is.na(size) || size <= 0) {
      return(c(NA_real_, NA_real_))
    }
    as.numeric(prop.test(hits, size)$conf.int)
  }
  df$ci <- t(vapply(
    seq_len(nrow(df)),
    function(i) confidenceBounds(df$afreq[i], df$total[i]),
    numeric(2)
  ))

  df$freq <- df$afreq / df$total

  g <- ggplot(data = df, mapping = aes(x = year, y = freq, fill = condition, color = condition)) +
    geom_point() +
    geom_line() +
    geom_ribbon(aes(ymin = ci[, 1], ymax = ci[, 2], fill = condition, color = condition), alpha = .3, linetype = 0) +
    xlab("TIME") +
    labs(color = "Virtual Corpus", fill = "Virtual Corpus") +
    ylab(sprintf("Observed frequency of \u201c%s\u201d", query)) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    scale_x_continuous(breaks = unique(df$year))
  print(g)
  # print(ggplotly(g, tooltip = c("x", "y")))

  df
}
| 36 | |
# Example run: frequency of the lemma "Heuschrecke" in three text classes,
# one data point per year from 2002 to 2018.
df <- conditionsOverTime(
  query = "[tt/l=Heuschrecke]",
  conditions = c(
    "textClass = /natur.*/",
    "textClass=/politik.*/",
    "textClass=/wirtschaft.*/"
  ),
  years = 2002:2018
)
# Alternative example (disabled):
#df <- conditionsOverTime("wegen dem [tt/p=NN]", c("textClass = /sport.*/", "textClass=/politik.*/", "textClass=/kultur.*/"), (1995:2005))