blob: b730917c3155c254a9f91aec4616ac7d34c46f5a [file] [log] [blame]
Marc Kupietz6e21b102023-06-02 18:04:04 +02001---
2title: "ICC Written Launch"
3output:
4 html_document:
5 css: style.css
6 self_contained: yes
7date: "`r Sys.Date()`"
8---
9
10```{r setup, include=FALSE, echo=FALSE, warning=FALSE}
# Global chunk defaults for the rendered report: hide the source code and
# suppress warnings. The knitr chunk option is `warning` (singular); the
# previously used `warnings = FALSE` is not a recognised option, so it was
# silently ignored and warnings still leaked into the output.
knitr::opts_chunk$set(echo = FALSE, warning = FALSE)

# Load the project's shared script. The chunks below rely on names that are
# not defined in this file (e.g. `icc`, `icc_con`, `theme_ids`) --
# presumably they come from here; verify against common.R.
source("common.R")
13```
14
15# Actual composition of ICC parts
16
17## Composition by ICC genre
18
19```{r composition_by_genre, message = FALSE}
# Token counts per ICC part and genre: `icc` is crossed with every value of
# `genre`, and each resulting row is queried separately with a virtual corpus
# restricted to that genre (`iccGenre=<genre>`).
icc_genre <- icc %>%
  expand_grid(genre) %>%
  mutate(vc = paste0("iccGenre=", genre)) %>%
  rowwise() %>% # corpusStats() is called once per row
  mutate(tokens = corpusStats(icc_con(lang, token), vc = vc)@tokens) %>%
  ungroup() # drop the row-wise grouping once the per-row queries are done

# Stacked bar chart: one bar per `lang`, segments filled by genre, with the
# token count printed in the middle of each segment.
icc_genre %>%
  ggplot(aes(x = lang, fill = genre, y = tokens)) +
  geom_col() +
  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
  theme_ids() +
  scale_fill_ids() +
  geom_text(
    aes(
      # Leave empty segments unlabelled instead of printing "0".
      label = if_else(tokens > 0, as.character(tokens), ""),
      y = tokens
    ),
    position = position_stack(reverse = FALSE, vjust = 0.5),
    color = "black",
    size = 3.2,
    family = "Fira Sans Condensed"
  )
Marc Kupietz6e21b102023-06-02 18:04:04 +020031
32```
33
34## Composition by date of publication
35
36
37```{r composition_by_pubdate}
# Publication years to probe; each year becomes its own virtual corpus.
year <- 1988:2023

# Token counts per ICC part and publication year: `icc` is crossed with every
# year, and each row is queried with a virtual corpus `pubDate in <year>`.
icc_year <- icc %>%
  expand_grid(year) %>%
  mutate(vc = paste0("pubDate in ", year)) %>%
  rowwise() %>% # corpusStats() is called once per row
  mutate(tokens = corpusStats(icc_con(lang, token), vc = vc)@tokens) %>%
  ungroup() # drop the row-wise grouping once the per-row queries are done

# Line chart with point markers: tokens over publication year, one series
# per `lang`.
icc_year %>%
  ggplot(aes(x = year, fill = lang, color = lang, y = tokens)) +
  geom_line() +
  geom_point() +
  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
  theme_ids()
51```
52
53## Part-of-Speech proportions
54
55```{r pos_proportions}
# Part-of-speech tags queried on the `ud/p` annotation layer.
# Currently disabled tags (kept for reference):
#   PUNCT, SYM, INTJ, X, NUM, PART
POS_tag <- c(
  "ADJ", "ADP",
  "ADV", "AUX",
  "CCONJ",
  "NOUN", "DET",
  "PROPN",
  "VERB",
  "PRON",
  "SCONJ"
)

# One frequency query per (ICC part, POS tag) combination; `f` is the value
# of the `f` column returned by frequencyQuery() (treated as a proportion by
# the percentage labels below).
icc_by_pos_tag <- icc %>%
  expand_grid(POS = POS_tag) %>%
  rowwise() %>% # frequencyQuery() is called once per row
  mutate(f = frequencyQuery(icc_con(lang), sprintf("[ud/p=%s]", POS))$f) %>%
  ungroup() # drop the row-wise grouping once the per-row queries are done

# Stacked bar chart: one bar per `lang`, segments filled by POS tag, each
# labelled with its share formatted as a percentage.
icc_by_pos_tag %>%
  ggplot(aes(x = lang, fill = POS, y = f)) +
  geom_col() +
  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
  scale_fill_ids() +
  scale_color_ids() +
  theme_ids(base_size = 12) +
  geom_text(
    aes(label = sprintf("%.2f%%", 100 * f), y = f),
    position = position_stack(reverse = FALSE, vjust = 0.5),
    color = "black",
    size = 3.2,
    family = "Fira Sans Condensed"
  )
Marc Kupietz6e21b102023-06-02 18:04:04 +020077```
78
79# Pilot study: Identification of Light Verb Constructions with *take*
80
81```{r prepare_ca, output=FALSE, message=FALSE}
82
83
84
85
86```
87
88## English: *take*
89
90```{r take_icc, echo=TRUE, message=FALSE}
# Collocation analysis for the lemma "take" on the connection returned by
# icc_con("eng"). The query matches "take" immediately followed by a noun;
# with no left context and one token of right context, the collocate slot is
# the position directly after the verb.
take_ca_icc <-
  collocationAnalysis(
    icc_con("eng"),
    "focus({[ud/l=take]} [ud/p=NOUN])",
    leftContextSize = 0,
    rightContextSize = 1,
    minOccur = 2,
    addExamples = TRUE # spelled out: `T` is an ordinary, reassignable variable
  )

# Render the result with the project's table helper.
take_ca_icc %>%
  show_table()
102```
103
104## German: *nehmen*
105
106```{r nehmen_icc, echo=TRUE}
# Collocation analysis for the lemma "nehmen" on the connection returned by
# icc_con("ger"). The query matches a noun (`tt/p=NN`) immediately followed
# by "nehmen"; with one token of left context and no right context, the
# collocate slot is the position directly before the verb.
nehmen_ca_icc <-
  collocationAnalysis(
    icc_con("ger"),
    "focus([tt/p=NN] {[tt/l=nehmen]})",
    leftContextSize = 1,
    rightContextSize = 0,
    minOccur = 2,
    addExamples = TRUE # spelled out: `T` is an ordinary, reassignable variable
  )

# Render the result with the project's table helper.
nehmen_ca_icc %>%
  show_table()
117```
118
119### For comparison based on the whole DeReKo
120
121```{r nehmen_dereko}
# Load a previously saved collocation analysis for "nehmen" from disk (no
# query is run here) and render it with the project's table helper.
nehmen_ca_dereko <- readRDS("../data/ca_nehmen_dereko_examples")
show_table(nehmen_ca_dereko)
124```
125
126## Norwegian: *ta*
127
Marc Kupietzd75a1792023-06-07 17:47:23 +0200128```{r ta_icc, echo=T, message=FALSE}
# Collocation analysis for the lemma "ta" on the connection returned by
# icc_con("nor"). The query matches "ta" immediately followed by a noun;
# with no left context and one token of right context, the collocate slot is
# the position directly after the verb.
ta_ca_icc <-
  collocationAnalysis(
    icc_con("nor"),
    "focus({[ud/l=ta]} [ud/p=NOUN])",
    leftContextSize = 0,
    rightContextSize = 1,
    minOccur = 2,
    addExamples = TRUE # spelled out: `T` is an ordinary, reassignable variable
  )

# Render the result with the project's table helper.
ta_ca_icc %>%
  show_table()
139```
140