blob: 475d2ebf11c5f497d1a2525376f17ca29e347e40 [file] [log] [blame]
Marc Kupietze457d992019-09-29 18:17:05 +02001#!/usr/bin/env Rscript
2#
3# Plot frequency of query expressions over time
4#
5library(RKorAPClient)
6library(ggplot2)
7
# Plot the relative frequency per publication year of a query expression and
# return the underlying data.
#
# Arguments:
#   query  query expression passed to corpusQuery().
#   con    connection object passed to corpusQuery() and corpusStats();
#          defaults to a new verbose "KorAPConnection".
#
# Value: a data frame with one row per year between the first and the last
# year that has at least one match, with the columns
#   year   publication year,
#   Freq   number of collected matches in that year (0 if there were none),
#   total  token count reported by corpusStats() for that year within the
#          virtual corpus,
#   freq   Freq / total,
#   ci     two-column matrix holding the prop.test() confidence interval.
# As a side effect the plot is printed.
freqPerYear <- function(query, con = new("KorAPConnection", verbose = TRUE)) {
  vc <- "pubDate since 2000 & pubDate until 2018 & textType = /Zeit.*/"
  q <- corpusQuery(con, query = query, vc = vc)
  q <- fetchAll(q)

  # Token count of the virtual corpus restricted to a single year.
  tokensPerYear <- function(year) {
    corpusStats(con, sprintf("%s & pubDate in %s", vc, year))@tokens
  }

  # NOTE(review): format(..., "%Y") presumes pubDate is Date-like -- confirm.
  matchYears <- as.integer(format(q@collectedMatches$pubDate, "%Y"))
  df <- as.data.frame(table(matchYears, dnn = "year"),
                      stringsAsFactors = FALSE)
  if (nrow(df) == 0L) {
    # Without this guard min()/max() return +/-Inf and `:` fails obscurely.
    stop(sprintf("No matches with a publication year for query \"%s\"", query),
         call. = FALSE)
  }
  # table() labels are character; make the year a proper integer key.
  df$year <- as.integer(df$year)

  # Insert the years without any match so that the time line has no gaps.
  df <- merge(data.frame(year = min(df$year):max(df$year)), df,
              by = "year", all = TRUE)
  # Assign through the column: the row-subset form df[rows, ]$Freq <- 0
  # raises an error when no row is NA (zero rows selected).
  df$Freq[is.na(df$Freq)] <- 0

  df$total <- vapply(df$year, tokensPerYear, numeric(1))
  df$freq <- df$Freq / df$total

  # One prop.test() per year; keep the confidence interval bounds as an
  # n x 2 matrix column (lower, upper).
  ciBounds <- vapply(
    Map(prop.test, df$Freq, df$total),
    function(test) as.numeric(test[["conf.int"]]),
    numeric(2)
  )
  df$ci <- t(ciBounds)

  g <- ggplot(data = df, aes(x = year, y = freq, group = 1)) +
    geom_ribbon(aes(ymin = ci[, 1], ymax = ci[, 2]), alpha = .3) +
    geom_point() +
    geom_line() +
    xlab("TIME") +
    ylab(sprintf("Observed frequency of \u201c%s\u201d", query)) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  print(g)
  df
}
# Alternative example queries (uncomment one to try it):
# df <- freqPerYear("Car-Bikini")
# df <- freqPerYear("[tt/p=ART & opennlp/p=ART] [tt/l=teilweise] [tt/p=NN]")

# Run the analysis for the default example query and keep the resulting table.
df <- freqPerYear(query = "Buschzulage")
35