blob: 4c5c5297e46528705d297778190ce528582d6403 [file] [log] [blame]
#!/usr/bin/env Rscript
#
# Plot frequency of query expressions per topic domain
#
library(RKorAPClient)
library(ggplot2)
# Plot the relative frequency of a query expression per primary topic domain.
#
# Arguments:
#   query  Query string handed to corpusQuery().
#   con    KorAPConnection used both for the query and for the per-domain
#          corpus statistics; defaults to a new verbose connection.
#
# Value:
#   A data frame with one row per primary topic domain and the columns
#   Domain, Freq (number of matches), total (token count reported by
#   corpusStats() for that domain), freq (Freq / total) and ci (two-column
#   matrix with the lower and upper bound of the prop.test() confidence
#   interval). The bar chart is printed as a side effect.
freqPerDomain <- function(query, con = new("KorAPConnection", verbose = TRUE)) {
  q <- corpusQuery(con, query = query, vc = "")
  q <- fetchAll(q)

  # Token count of the sub-corpus whose textClass matches /<topic>.*/.
  tokensPerMainTopic <- function(topic) {
    corpusStats(con, sprintf("textClass = /%s.*/", topic))@tokens
  }

  # The primary topic is the first space-separated entry of textClass.
  # `[` (unlike `[[`) yields NA instead of a "subscript out of bounds" error
  # when a match has an empty textClass; table() then leaves those matches out.
  classes <- strsplit(as.character(q@collectedMatches$textClass), " ")
  primaryTopic <- vapply(classes, `[`, character(1), 1)
  df <- as.data.frame(table(primaryTopic, dnn = "Domain"))

  # Nothing to plot: return an empty frame with the usual columns rather than
  # failing later in prop.test() or ggplot().
  if (nrow(df) == 0) {
    warning(
      sprintf("No matches with a text class found for query '%s'", query),
      call. = FALSE
    )
    df$total <- numeric(0)
    df$freq <- numeric(0)
    df$ci <- matrix(numeric(0), nrow = 0, ncol = 2)
    return(df)
  }

  # Domain is a factor after as.data.frame(table(...)); convert explicitly so
  # the label (not the factor code) is interpolated into the query.
  df$total <- vapply(
    as.character(df$Domain), tokensPerMainTopic, numeric(1),
    USE.NAMES = FALSE
  )
  df$freq <- df$Freq / df$total

  # One confidence interval per domain, stored as an n x 2 matrix column.
  tests <- Map(prop.test, df$Freq, df$total)
  df$ci <- t(vapply(
    tests,
    function(tst) as.numeric(tst[["conf.int"]]),
    numeric(2)
  ))

  g <- ggplot(data = df, mapping = aes(x = Domain, y = freq)) +
    geom_col() +
    geom_errorbar(
      aes(ymin = ci[, 1], ymax = ci[, 2]),
      width = .5, alpha = .5
    ) +
    ylab(sprintf("Observed frequency of \u201c%s\u201d", query)) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  print(g)
  df
}
# Example run: plot the per-domain frequencies of "Hatespeech" and keep the
# resulting table in `df`.
df <- freqPerDomain(query = "Hatespeech")