blob: 12b2edc12e32b45ca19b981f51c619dc1e0fcf01 [file] [log] [blame]
Marc Kupietza97e8d02019-09-25 20:06:24 +02001#!/usr/bin/env Rscript
2#
3# Plot frequency of query expressions per topic domain
4#
5library(RKorAPClient)
6library(ggplot2)
7
#' Plot the frequency of a query expression per topic domain
#'
#' Runs `query` against the corpus behind `con`, fetches all matches, derives
#' a topic domain for every match from the first space-separated token of its
#' `textClass`, counts the matches per domain and relates each count to the
#' token total reported by `corpusStats()` for the virtual corpus
#' `textClass = /<Domain>.*/`. The result is drawn as a bar chart with error
#' bars and also returned as a data frame.
#'
#' @param query Query expression handed to `corpusQuery()`; it is also
#'   interpolated into the y-axis label.
#' @param con Connection object passed to `corpusQuery()` and `corpusStats()`.
#'   Defaults to a new verbose `KorAPConnection`.
#' @return The per-domain data frame that is plotted. The plot reads its
#'   columns `Domain`, `ipm`, `conf.low` and `conf.high` (presumably the last
#'   three are added by `ci()` / `ipm()` -- confirm against RKorAPClient).
freqPerDomain <- function(query, con = new("KorAPConnection", verbose = TRUE)) {
  domain_stats <- corpusQuery(con, query = query, vc = "") %>%
    fetchAll() %>%
    slot("collectedMatches") %>%
    mutate(
      # `[` (not `[[`) yields NA instead of a "subscript out of bounds" error
      # when a textClass is empty; vapply() keeps the result a character
      # vector even when there are no matches at all.
      Domain = vapply(
        strsplit(as.character(textClass), " "),
        `[`,
        character(1),
        1
      )
    ) %>%
    # Matches without a usable domain cannot be attributed to any bar.
    filter(!is.na(Domain)) %>%
    group_by(Domain) %>%
    summarise(count = dplyr::n()) %>%
    mutate(
      tokens = corpusStats(
        con,
        sprintf("textClass = /%s.*/", Domain)
      )$tokens
    ) %>%
    ci(x = count) %>%
    ipm()

  # The table is kept in a local variable rather than being written to the
  # caller's environment with `<<-`, so calling this function no longer
  # overwrites an unrelated global `df`.
  g <- ggplot(
    domain_stats,
    aes(x = Domain, y = ipm, ymin = conf.low, ymax = conf.high)
  ) +
    geom_col() +
    geom_errorbar(width = .3, alpha = .3) +
    ylab(sprintf("Observed frequency/million of \u201c%s\u201d", query)) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  print(g)

  domain_stats
}
# Example run: draw the per-domain plot for "Hatespeech" and keep the
# underlying frequency table in `df`.
df <- freqPerDomain(query = "Hatespeech")
29