blob: 4c5c5297e46528705d297778190ce528582d6403 [file] [log] [blame]
#!/usr/bin/env Rscript
#
# Plot frequency of query expressions per topic domain
#
5library(RKorAPClient)
6library(ggplot2)
7
# Plot the relative frequency of a query expression per topic domain.
#
# Runs `query` through corpusQuery()/fetchAll() on `con`, assigns every match
# to its primary topic domain (the first space-separated token of the match's
# `textClass`) and relates the number of matches per domain to the number of
# tokens reported by corpusStats() for that domain.
#
# Arguments:
#   query  Query expression handed to corpusQuery().
#   con    Connection object to query; defaults to a new verbose
#          "KorAPConnection".
#
# Value:
#   A data frame with one row per domain and the columns
#     Domain  primary topic domain,
#     Freq    number of matches in that domain,
#     total   token count reported for that domain,
#     freq    Freq / total,
#     ci      two-column matrix with the confidence interval from prop.test().
#   As a side effect, a bar chart with error bars is printed.
freqPerDomain <- function(query, con = new("KorAPConnection", verbose = TRUE)) {
  q <- corpusQuery(con, query = query, vc = "")
  q <- fetchAll(q)

  # Token count of all texts whose textClass matches /<topic>.*/.
  tokensPerMainTopic <- function(topic) {
    corpusStats(con, sprintf("textClass = /%s.*/", topic))@tokens
  }

  # Use `[` rather than `[[`: strsplit() yields character(0) for an empty
  # textClass, and `[[`(x, 1) would then abort with "subscript out of bounds".
  # `[` gives NA instead, and table() leaves NA entries out by default.
  q@collectedMatches$primaryTopic <- vapply(
    strsplit(as.character(q@collectedMatches$textClass), " "),
    `[`, character(1), 1
  )

  df <- as.data.frame(table(q@collectedMatches$primaryTopic, dnn = "Domain"))

  # Domain is a factor after as.data.frame(); convert explicitly so the label
  # (not the factor object) is interpolated into the textClass expression.
  df$total <- vapply(
    as.character(df$Domain), tokensPerMainTopic, numeric(1),
    USE.NAMES = FALSE
  )
  df$freq <- df$Freq / df$total

  # One prop.test() per domain; conf.int has length 2, so vapply() returns a
  # 2 x n matrix that is transposed to one (lower, upper) row per domain.
  tests <- Map(prop.test, df$Freq, df$total)
  df$ci <- t(vapply(tests, function(test) test$conf.int, numeric(2)))

  g <- ggplot(data = df, mapping = aes(x = Domain, y = freq)) +
    geom_col() +
    geom_errorbar(aes(ymin = ci[, 1], ymax = ci[, 2]), width = .5, alpha = .5) +
    ylab(sprintf("Observed frequency of \u201c%s\u201d", query)) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  print(g)
  df
}
# Run the analysis for the example query and keep the resulting table.
df <- freqPerDomain(query = "Hatespeech")
30