#!/usr/bin/env Rscript
#
# Plot frequency of query expressions per topic domain
#
|  | 5 | library(RKorAPClient) | 
|  | 6 | library(ggplot2) | 
|  | 7 |  | 
#' Plot and tabulate the relative frequency of a query per topic domain
#'
#' Sends `query` through `corpusQuery()` (with an empty `vc` argument),
#' fetches all matches and groups them by the first entry of their
#' space-separated `textClass` value. For every such domain the number of
#' hits is divided by the token count that `corpusStats()` reports for
#' `textClass = /<domain>.*/`. A bar chart with `prop.test()` confidence
#' intervals is printed as a side effect.
#'
#' Matches whose `textClass` is empty or `NA` have no primary domain and are
#' therefore not counted in any bar.
#'
#' @param query Query expression handed to `corpusQuery()`; also used in the
#'   y-axis label.
#' @param con Connection object; defaults to a new verbose `KorAPConnection`.
#' @return A data frame with one row per domain and the columns `Domain`,
#'   `Freq` (number of hits), `total` (tokens in the domain), `freq`
#'   (`Freq / total`) and `ci` (two-column matrix: lower and upper bound of
#'   the confidence interval).
freqPerDomain <- function(query, con = new("KorAPConnection", verbose = TRUE)) {
  q <- corpusQuery(con, query = query, vc = "")
  q <- fetchAll(q)

  # Token count of the sub-corpus whose textClass starts with `topic`.
  tokens_per_main_topic <- function(topic) {
    corpusStats(con, sprintf("textClass = /%s.*/", topic))@tokens
  }

  # The primary topic is the first space-separated entry of textClass.
  # strsplit("") yields character(0), so guard against empty annotations
  # instead of failing with "subscript out of bounds".
  topic_parts <- strsplit(as.character(q@collectedMatches$textClass), " ")
  primary_topic <- vapply(
    topic_parts,
    function(parts) if (length(parts) > 0) parts[[1]] else NA_character_,
    character(1)
  )

  # table() drops NA by default, so matches without a topic are excluded.
  df <- as.data.frame(table(primary_topic, dnn = "Domain"))
  df$total <- vapply(
    as.character(df$Domain),
    tokens_per_main_topic,
    numeric(1),
    USE.NAMES = FALSE
  )
  df$freq <- df$Freq / df$total

  # vapply() always returns a 2 x n matrix, so `ci` is n x 2 for any n.
  tests <- Map(prop.test, df$Freq, df$total)
  df$ci <- t(vapply(
    tests,
    function(test) as.numeric(test[["conf.int"]]),
    numeric(2)
  ))

  g <- ggplot(data = df, mapping = aes(x = Domain, y = freq)) +
    geom_col() +
    geom_errorbar(
      aes(ymin = ci[, 1], ymax = ci[, 2]),
      width = .5,
      alpha = .5
    ) +
    # Unicode escapes keep the curly quotes independent of file encoding.
    ylab(sprintf("Observed frequency of \u201c%s\u201d", query)) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  print(g)
  df
}
# Example run: plot and tabulate the per-domain frequency of "Hatespeech".
df <- freqPerDomain(query = "Hatespeech")
|  | 30 |  |