blob: 5f28216498374c49e200eb14c81f61ecf8b8d59a [file] [log] [blame]
Marc Kupietz6e21b102023-06-02 18:04:04 +02001---
2title: "ICC Written Launch"
3output:
4 html_document:
5 css: style.css
6 self_contained: yes
7date: "`r Sys.Date()`"
8---
9
10```{r setup, include=FALSE, echo=FALSE, warning=FALSE}
# Document-wide chunk defaults: hide the source code and suppress warnings in
# the rendered output. The knitr option is spelled `warning` (singular); the
# previous `warnings = FALSE` had no effect on warning output.
knitr::opts_chunk$set(echo = FALSE, warning = FALSE)
# NOTE(review): the chunks below use names that are not defined in this file
# (e.g. `icc`, `icc_con()`, `theme_ids()`, `show_table()`); presumably they are
# provided by this script -- confirm in common.R.
source("common.R")
13```
14
15# Actual composition of ICC parts
16
17## Composition by ICC genre
18
19```{r composition_by_genre, message = FALSE}
# Token counts per language and ICC genre.
# Every row of `icc` is crossed with every value of `genre`; for each
# combination a virtual-corpus query "iccGenre=<genre>" is built and the
# `tokens` slot of the corresponding corpusStats() result is stored.
# NOTE(review): `icc`, `genre` and `icc_con()` are not defined in this chunk;
# presumably they come from common.R -- confirm.
icc_genre <- icc %>%
  expand_grid(genre) %>%
  mutate(vc = paste0("iccGenre=", genre)) %>%
  rowwise() %>%
  mutate(tokens = corpusStats(icc_con(lang, token), vc = vc)@tokens) %>%
  ungroup() # do not leave the row-wise grouping on the stored result

# Stacked bar chart: one bar per language, one segment per genre. Segments
# with a token count of zero get an empty label instead of "0".
icc_genre %>%
  ggplot(aes(x = lang, fill = genre, y = tokens)) +
  geom_col() +
  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
  theme_ids() +
  geom_text(
    aes(label = if_else(tokens > 0, as.character(tokens), ""), y = tokens),
    # vjust = 0.5 centres each label vertically inside its bar segment
    position = position_stack(reverse = FALSE, vjust = 0.5),
    color = "white",
    size = 3.2,
    family = "Fira Sans Condensed"
  )
30
31```
32
33## Composition by date of publication
34
35
36```{r composition_by_pubdate}
# Publication years to query, one virtual corpus per year.
year <- 1988:2023

# Token counts per language and publication year.
# Every row of `icc` is crossed with every year; for each combination a
# virtual-corpus query "pubDate in <year>" is built and the `tokens` slot of
# the corresponding corpusStats() result is stored.
# NOTE(review): `icc` and `icc_con()` are not defined in this chunk;
# presumably they come from common.R -- confirm.
icc_year <- icc %>%
  expand_grid(year) %>%
  mutate(vc = paste0("pubDate in ", year)) %>%
  rowwise() %>%
  mutate(tokens = corpusStats(icc_con(lang, token), vc = vc)@tokens) %>%
  ungroup() # do not leave the row-wise grouping on the stored result

# One line (with point markers) per language: tokens over publication year.
icc_year %>%
  ggplot(aes(x = year, fill = lang, color = lang, y = tokens)) +
  geom_line() +
  geom_point() +
  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
  theme_ids()
50```
51
52## Part-of-Speech proportions
53
54```{r pos_proportions}
# Part-of-speech tags included in the comparison.
# Deliberately left out (kept here for reference):
#   "PUNCT", "SYM", "X", "NUM", "PART"
POS_tag <- c(
  "ADJ", "ADP",
  "ADV", "AUX",
  "INTJ", "CCONJ",
  "NOUN", "DET",
  "PROPN",
  "VERB",
  "PRON",
  "SCONJ"
)

# One frequency query per language and POS tag: the query "[ud/p=<TAG>]" is
# sent to the language's corpus and the `f` column of the result is stored.
# NOTE(review): `f` is formatted as a percentage below, so it is presumably a
# relative frequency in [0, 1] -- confirm against frequencyQuery().
icc_by_pos_tag <- icc %>%
  expand_grid(POS = POS_tag) %>%
  rowwise() %>%
  mutate(f = frequencyQuery(icc_con(lang), sprintf("[ud/p=%s]", POS))$f) %>%
  ungroup() # do not leave the row-wise grouping on the stored result

# Stacked bar chart: one bar per language, one segment per POS tag, each
# segment labelled with its share formatted as a percentage.
icc_by_pos_tag %>%
  ggplot(aes(x = lang, fill = POS, y = f)) +
  geom_col() +
  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
  theme_ids(base_size = 12) +
  geom_text(
    aes(label = sprintf("%.2f%%", 100 * f), y = f),
    # vjust = 0.5 centres each label vertically inside its bar segment
    position = position_stack(reverse = FALSE, vjust = 0.5),
    color = "white",
    size = 3.2,
    family = "Fira Sans Condensed"
  )
74```
75
76# Pilot study: Identification of Light Verb Constructions with *take*
77
78```{r prepare_ca, output=FALSE, message=FALSE}
79
80
81
82
83```
84
85## English: *take*
86
87```{r take_icc, echo=TRUE, message=FALSE}
# Collocation analysis for the lemma "take" in the English corpus.
# The query puts the focus on "take" directly followed by a noun, so only one
# token of right context (and no left context) is examined.
take_ca_icc <-
  collocationAnalysis(
    icc_con("eng"),
    "focus({[ud/l=take]} [ud/p=NOUN])",
    leftContextSize = 0,
    rightContextSize = 1,
    minOccur = 2,
    addExamples = TRUE # spelled out: `T` is an ordinary, reassignable variable
  )

# Render the result with the project's table helper.
take_ca_icc %>% show_table()
99```
100
101## German: *nehmen*
102
103```{r nehmen_icc, echo=TRUE}
# Collocation analysis for the lemma "nehmen" in the German corpus.
# The query puts the focus on "nehmen" directly preceded by a noun (tt/p=NN),
# so only one token of left context (and no right context) is examined.
nehmen_ca_icc <-
  collocationAnalysis(
    icc_con("ger"),
    "focus([tt/p=NN] {[tt/l=nehmen]})",
    leftContextSize = 1,
    rightContextSize = 0,
    minOccur = 2,
    addExamples = TRUE # spelled out: `T` is an ordinary, reassignable variable
  )

# Render the result with the project's table helper.
nehmen_ca_icc %>% show_table()
114```
115
116### For comparison based on the whole DeReKo
117
118```{r nehmen_dereko}
# Load a previously saved collocation-analysis result for "nehmen" from disk
# (no live query is made here) and render it with the project's table helper.
nehmen_ca_dereko <- readRDS(file = "../data/ca_nehmen_dereko_examples")
show_table(nehmen_ca_dereko)
121```
122
123## Norwegian: *ta*
124
```{r ta_icc, echo=TRUE, message=FALSE, eval=FALSE}
# Collocation analysis for the lemma "ta" in the Norwegian corpus.
# The query puts the focus on "ta" directly followed by a noun, so only one
# token of right context (and no left context) is examined.
ta_ca_icc <-
  collocationAnalysis(
    icc_con("nor"),
    "focus({[ud/l=ta]} [ud/p=NOUN])",
    leftContextSize = 0,
    rightContextSize = 1,
    minOccur = 2,
    addExamples = TRUE # spelled out: `T` is an ordinary, reassignable variable
  )

# Render the result with the project's table helper.
ta_ca_icc %>% show_table()
136```
137