# blob: f7bdda65bf1d0ab2dfef7a1f02528f4e894daf57 [file] [log] [blame]
# Shared setup. NOTE(review): `icc`, `genre`, `token`, `icc_con()` and
# `theme_ids()` are not defined in this script and are presumably provided by
# R/common.R -- confirm.
source("R/common.R")

# Token counts per language and genre ----

# Cross every row of `icc` with every genre, build a virtual-corpus query that
# restricts the statistics request to that genre, and fetch the token count
# row by row (one corpusStats() call per row).
icc_genre <- icc %>%
  expand_grid(genre) %>%
  mutate(vc = paste0("iccGenre=", genre)) %>%
  rowwise() %>%
  mutate(tokens = corpusStats(icc_con(lang, token), vc = vc)@tokens) %>%
  # Drop the row-wise grouping so the result is a plain tibble again.
  ungroup()

# Stacked bar chart: one bar per language, one segment per genre. Each segment
# is labelled with its token count; empty segments get no label.
plot <- icc_genre %>%
  ggplot(aes(x = lang, fill = genre, y = tokens)) +
  geom_col() +
  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
  theme_ids() +
  geom_text(
    aes(label = if_else(tokens > 0, as.character(tokens), ""), y = tokens),
    position = position_stack(reverse = FALSE, vjust = 0.5),
    color = "white",
    size = 3.2,
    family = "Fira Sans Condensed"
  )

# Export the figure in three formats. The plot is passed explicitly so the
# output does not depend on whatever ggplot2 currently holds as last_plot().
ggsave(
  "target/tokens_per_genre.png",
  plot = plot,
  width = 70 * .pt, height = 45 * .pt, units = "mm", dpi = 800
)
ggsave(
  "target/tokens_per_genre.svg",
  plot = plot,
  width = 70 * .pt, height = 45 * .pt, units = "mm", dpi = 800
)
ggsave(
  "target/tokens_per_genre.pdf",
  plot = plot,
  device = cairo_pdf,
  width = 70 * .pt, height = 45 * .pt, units = "mm", dpi = 800
)

# Only show the plot when running inside RStudio.
if (rstudioapi::isAvailable()) {
  print(plot)
}

# Token counts per language and publication year ----

# Publication years to query, one virtual corpus per year.
year <- 1988:2022

# Cross every row of `icc` with every year, build a virtual-corpus query that
# restricts the statistics request to that publication year, and fetch the
# token count row by row (one corpusStats() call per row).
icc_year <- icc %>%
  expand_grid(year) %>%
  mutate(vc = paste0("pubDate in ", year)) %>%
  rowwise() %>%
  mutate(tokens = corpusStats(icc_con(lang, token), vc = vc)@tokens) %>%
  # Drop the row-wise grouping so the result is a plain tibble again.
  ungroup()

# Line chart with point markers: tokens over years, one series per language.
plot <- icc_year %>%
  ggplot(aes(x = year, fill = lang, color = lang, y = tokens)) +
  geom_line() +
  geom_point() +
  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
  theme_ids()

# Export the figure in three formats. The plot is passed explicitly so the
# output does not depend on whatever ggplot2 currently holds as last_plot().
ggsave(
  "target/tokens_per_year.png",
  plot = plot,
  width = 70 * .pt, height = 45 * .pt, units = "mm", dpi = 800
)
ggsave(
  "target/tokens_per_year.svg",
  plot = plot,
  width = 70 * .pt, height = 45 * .pt, units = "mm", dpi = 800
)
ggsave(
  "target/tokens_per_year.pdf",
  plot = plot,
  device = cairo_pdf,
  width = 70 * .pt, height = 45 * .pt, units = "mm", dpi = 800
)

# Only show the plot when running inside RStudio.
if (rstudioapi::isAvailable()) {
  print(plot)
}

# Part-of-speech proportions per language ----

# POS tags to query. The commented-out tags are deliberately excluded.
POS_tag <- c(
  "ADJ", "ADP", # "PUNCT",
  "ADV", "AUX", # "SYM",
  "INTJ", "CCONJ", # "X",
  "NOUN", "DET",
  "PROPN", # "NUM",
  "VERB", # "PART",
  "PRON",
  "SCONJ"
)

# Cross every row of `icc` with every tag and run one frequency query per row
# against the `ud/p` annotation layer, keeping the `f` column of the result.
# NOTE(review): `f` is presumably a relative frequency (the labels below
# render it as a percentage) -- confirm against frequencyQuery().
icc_by_pos_tag <- icc %>%
  expand_grid(POS = POS_tag) %>%
  rowwise() %>%
  mutate(f = frequencyQuery(icc_con(lang), sprintf("[ud/p=%s]", POS))$f) %>%
  # Drop the row-wise grouping so the result is a plain tibble again.
  ungroup()

# Stacked bar chart: one bar per language, one segment per POS tag, each
# segment labelled with 100 * f formatted as a percentage.
plot <- icc_by_pos_tag %>%
  ggplot(aes(x = lang, fill = POS, y = f)) +
  geom_col() +
  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
  theme_ids(base_size = 12) +
  geom_text(
    aes(label = sprintf("%.2f%%", 100 * f), y = f),
    position = position_stack(reverse = FALSE, vjust = 0.5),
    color = "white",
    size = 3.2,
    family = "Fira Sans Condensed"
  )

# Export the figure in three formats. The plot is passed explicitly so the
# output does not depend on whatever ggplot2 currently holds as last_plot().
ggsave(
  "target/pos_proportions.png",
  plot = plot,
  width = 70 * .pt, height = 45 * .pt, units = "mm", dpi = 800
)
ggsave(
  "target/pos_proportions.svg",
  plot = plot,
  width = 70 * .pt, height = 45 * .pt, units = "mm", dpi = 800
)
ggsave(
  "target/pos_proportions.pdf",
  plot = plot,
  device = cairo_pdf,
  width = 70 * .pt, height = 45 * .pt, units = "mm", dpi = 800
)

# Only show the plot when running inside RStudio.
if (rstudioapi::isAvailable()) {
  print(plot)
}
68}