---
title: "ICC Written Launch"
output:
  html_document:
    css: style.css
    self_contained: yes
date: "`r Sys.Date()`"
---
9
10```{r setup, include=FALSE, echo=FALSE, warning=FALSE}
# Report-wide chunk defaults: hide source code and suppress warnings in the
# rendered output. The knitr option is spelled `warning` (singular); the
# former `warnings = FALSE` is not a knitr chunk option, so it had no effect.
knitr::opts_chunk$set(echo = FALSE, warning = FALSE)
# Evaluate the shared helper script. The chunks below use names that are not
# defined in this file (e.g. `icc`, `icc_con()`, `theme_ids()`); presumably
# they come from here -- confirm against common.R.
source("common.R")
13```
14
15# Actual composition of ICC parts
16
17## Composition by ICC genre
18
19```{r composition_by_genre, message = FALSE}
# Token counts per ICC language and genre.
# Every row of `icc` is crossed with every value of `genre`; for each
# combination a virtual-corpus query string ("iccGenre=<genre>") is built and
# the `tokens` slot of the corpusStats() result is stored.
icc_genre <- icc %>%
  expand_grid(genre) %>%
  mutate(vc = paste0("iccGenre=", genre)) %>%
  rowwise() %>% # corpusStats() is evaluated once per row
  mutate(tokens = corpusStats(icc_con(lang, token), vc = vc)@tokens) %>%
  ungroup() # do not leave the row-wise grouping on the stored result

# Stacked bar chart: one bar per language, one segment per genre.
icc_genre %>%
  ggplot(aes(x = lang, fill = genre, y = tokens)) +
  geom_col() +
  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
  theme_ids() +
  scale_fill_ids() +
  # Print the token count in the middle of each segment; empty segments get
  # an empty label instead of "0".
  geom_text(
    aes(label = if_else(tokens > 0, as.character(tokens), ""), y = tokens),
    position = position_stack(reverse = FALSE, vjust = 0.5),
    color = "black",
    size = 3.2,
    family = "Fira Sans Condensed"
  )
Marc Kupietz6e21b102023-06-02 18:04:04 +020031
32```
33
34## Composition by date of publication
35
36
37```{r composition_by_pubdate}
# Publication years to query. The name `year` is kept because
# expand_grid(year) uses it as the name of the resulting column.
year <- 1988:2023

# Token counts per ICC language and publication year.
# Every row of `icc` is crossed with every year; for each combination a
# virtual-corpus query string ("pubDate in <year>") is built and the `tokens`
# slot of the corpusStats() result is stored.
icc_year <- icc %>%
  expand_grid(year) %>%
  mutate(vc = paste0("pubDate in ", year)) %>%
  rowwise() %>% # corpusStats() is evaluated once per row
  mutate(tokens = corpusStats(icc_con(lang, token), vc = vc)@tokens) %>%
  ungroup() # do not leave the row-wise grouping on the stored result

# One line (with point markers) per language over the publication years.
icc_year %>%
  ggplot(aes(x = year, fill = lang, color = lang, y = tokens)) +
  geom_line() +
  geom_point() +
  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
  theme_ids()
51```
52
53## Part-of-Speech proportions
54
55```{r pos_proportions}
# Part-of-speech tags to compare. The tags PUNCT, SYM, X, NUM and PART were
# already commented out in the original list and stay excluded.
POS_tag <- c(
  "ADJ", "ADP", # "PUNCT",
  "ADV", "AUX", # "SYM",
  "INTJ", "CCONJ", # "X",
  "NOUN", "DET",
  "PROPN", # "NUM",
  "VERB", # "PART",
  "PRON",
  "SCONJ"
)

# One frequency query per (language, POS tag) pair; the `f` column of the
# frequencyQuery() result is stored.
# NOTE(review): `f` is presumably a relative frequency (it is rendered as a
# percentage below) -- confirm against the frequencyQuery() documentation.
icc_by_pos_tag <- icc %>%
  expand_grid(POS = POS_tag) %>%
  rowwise() %>% # frequencyQuery() is evaluated once per row
  mutate(f = frequencyQuery(icc_con(lang), sprintf("[ud/p=%s]", POS))$f) %>%
  ungroup() # do not leave the row-wise grouping on the stored result

# Stacked bar chart: one bar per language, one segment per POS tag.
icc_by_pos_tag %>%
  ggplot(aes(x = lang, fill = POS, y = f)) +
  geom_col() +
  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
  scale_fill_ids() +
  scale_color_ids() +
  theme_ids(base_size = 12) +
  # Print each segment's value as a percentage with two decimals, centred in
  # the segment.
  geom_text(
    aes(label = sprintf("%.2f%%", 100 * f), y = f),
    position = position_stack(reverse = FALSE, vjust = 0.5),
    color = "black",
    size = 3.2,
    family = "Fira Sans Condensed"
  )
Marc Kupietz6e21b102023-06-02 18:04:04 +020076```
77
78# Pilot study: Identification of Light Verb Constructions with *take*
79
80```{r prepare_ca, output=FALSE, message=FALSE}
81
82
83
84
85```
86
87## English: *take*
88
89```{r take_icc, echo=TRUE, message=FALSE}
# Collocation analysis for English "take" on the connection icc_con("eng").
# Query: a token with UD lemma "take" followed by a token tagged NOUN; the
# braces presumably mark the focused part of the match (confirm against the
# query-language documentation).
# Context window: no token to the left, one token to the right.
take_ca_icc <-
  collocationAnalysis(
    icc_con("eng"),
    "focus({[ud/l=take]} [ud/p=NOUN])",
    leftContextSize = 0,
    rightContextSize = 1,
    minOccur = 2,
    addExamples = TRUE # spelled out: `T` is an ordinary, reassignable variable
  )

take_ca_icc %>% show_table()
101```
102
103## German: *nehmen*
104
105```{r nehmen_icc, echo=TRUE}
# Collocation analysis for German "nehmen" on the connection icc_con("ger").
# Query: a token tagged NN (tt layer) followed by a token with lemma
# "nehmen"; the braces presumably mark the focused part of the match (confirm
# against the query-language documentation).
# Context window: one token to the left, no token to the right.
nehmen_ca_icc <-
  collocationAnalysis(
    icc_con("ger"),
    "focus([tt/p=NN] {[tt/l=nehmen]})",
    leftContextSize = 1,
    rightContextSize = 0,
    minOccur = 2,
    addExamples = TRUE # spelled out: `T` is an ordinary, reassignable variable
  )
nehmen_ca_icc %>% show_table()
116```
117
118### For comparison based on the whole DeReKo
119
120```{r nehmen_dereko}
# Read the stored "nehmen" collocation result from the RDS file under ../data
# (the section heading describes it as based on the whole DeReKo) and render
# it with the same table helper as the ICC results.
nehmen_ca_dereko <- readRDS(file = "../data/ca_nehmen_dereko_examples")
show_table(nehmen_ca_dereko)
123```
124
125## Norwegian: *ta*
126
Marc Kupietzd75a1792023-06-07 17:47:23 +0200127```{r ta_icc, echo=T, message=FALSE}
# Collocation analysis for Norwegian "ta" on the connection icc_con("nor").
# Query: a token with UD lemma "ta" followed by a token tagged NOUN; the
# braces presumably mark the focused part of the match (confirm against the
# query-language documentation).
# Context window: no token to the left, one token to the right.
ta_ca_icc <-
  collocationAnalysis(
    icc_con("nor"),
    "focus({[ud/l=ta]} [ud/p=NOUN])",
    leftContextSize = 0,
    rightContextSize = 1,
    minOccur = 2,
    addExamples = TRUE # spelled out: `T` is an ordinary, reassignable variable
  )
ta_ca_icc %>% show_table()
138```
139