blob: 48041150c2a9b19b486c2f6301398893c33283b2 [file] [log] [blame]
---
title: "ICC Written Launch"
output:
  html_document:
    css: style.css
    self_contained: yes
date: "`r Sys.Date()`"
---
9
```{r setup, include=FALSE, echo=FALSE, warning=FALSE}
# Report-wide chunk defaults: hide source code and suppress warnings.
# knitr's option is `warning` (singular); the former `warnings = FALSE` is
# not a recognised chunk option, so it never silenced warnings in later chunks.
knitr::opts_chunk$set(echo = FALSE, warning = FALSE)
# Shared definitions. Presumably provides `icc`, `genre`, `icc_con()`,
# `theme_ids()`, `show_table()` etc. used by the chunks below -- confirm
# against common.R.
source("common.R")
```
14
# Actual composition of ICC parts
16
## Composition by ICC genre
18
```{r composition_by_genre, message = FALSE}
# One row per (corpus, genre) combination, with the token count of the
# virtual corpus "iccGenre=<genre>" queried through that corpus' connection.
# rowwise() makes each corpusStats() call operate on a single row; ungroup()
# afterwards so the stored tibble does not stay row-wise.
icc_genre <- icc %>%
  expand_grid(genre) %>%
  mutate(vc = paste0("iccGenre=", genre)) %>%
  rowwise() %>%
  mutate(tokens = corpusStats(icc_con(lang, token), vc = vc)@tokens) %>%
  ungroup()

# Stacked bars per language, filled by genre; each non-empty segment is
# labelled with its token count, centred inside the segment.
icc_genre %>%
  ggplot(aes(x = lang, fill = genre, y = tokens)) +
  geom_col() +
  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
  theme_ids() +
  scale_fill_ids() +
  geom_text(
    aes(label = if_else(tokens > 0, as.character(tokens), ""), y = tokens),
    position = position_stack(reverse = FALSE, vjust = 0.5),
    color = "black",
    size = 3.2,
    family = "Fira Sans Condensed"
  )
```
33
## Composition by date of publication
35
36
```{r composition_by_pubdate, message=FALSE, warning=FALSE}
# Publication years to query, one virtual corpus ("pubDate in <year>") each.
year <- 1986:2023

# One row per (corpus, year) combination with the token count for that year.
# rowwise() makes each corpusStats() call operate on a single row; ungroup()
# afterwards so the stored tibble does not stay row-wise.
icc_year <- icc %>%
  expand_grid(year) %>%
  mutate(vc = paste0("pubDate in ", year)) %>%
  rowwise() %>%
  mutate(tokens = corpusStats(icc_con(lang, token), vc = vc)@tokens) %>%
  ungroup()

# Loess-smoothed area per language over publication year.
# The former `ylim(0, NA)` was dropped: the scale_y_continuous() call further
# down replaces any earlier y scale, so it never had an effect (ggplot2 only
# emitted a "scale already present" message, hidden by message=FALSE).
# NOTE(review): xlim() restricts the x scale to 1990-2023, so the rows for
# 1986-1989 are queried above but not plotted -- confirm this is intended.
icc_year %>%
  ggplot(aes(x = year, fill = lang, color = lang, y = tokens)) +
  # geom_smooth(se = FALSE, span = 0.25) +
  xlim(1990, 2023) +
  stat_smooth(
    geom = "area",
    method = "loess",
    span = 1 / 4,
    alpha = 0.1
  ) +
  # geom_area(alpha = 0.1, position = "identity") +
  scale_fill_ids() +
  scale_colour_ids() +
  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
  theme_ids()
```
58
## Part-of-Speech proportions
60
```{r pos_proportions}
# POS tags included in the comparison; the commented-out tags are
# deliberately left out of the query list.
POS_tag <- c(
  "ADJ", "ADP", # "PUNCT",
  "ADV", "AUX", # "SYM",
  # "INTJ",
  "CCONJ", # "X",
  "NOUN", "DET",
  "PROPN", # "NUM",
  "VERB", # "PART",
  "PRON",
  "SCONJ"
)

# One row per (corpus, POS tag) combination; `f` is the `f` column returned
# by frequencyQuery() for the query "[ud/p=<tag>]". rowwise() makes each
# query operate on a single row; ungroup() afterwards so the stored tibble
# does not stay row-wise.
icc_by_pos_tag <- icc %>%
  expand_grid(POS = POS_tag) %>%
  rowwise() %>%
  mutate(f = frequencyQuery(icc_con(lang), sprintf("[ud/p=%s]", POS))$f) %>%
  ungroup()

# Stacked bars per language, filled by POS tag; each segment is labelled with
# 100 * f formatted as a percentage, centred inside the segment.
icc_by_pos_tag %>%
  ggplot(aes(x = lang, fill = POS, y = f)) +
  geom_col() +
  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
  scale_fill_ids() +
  scale_color_ids() +
  theme_ids(base_size = 12) +
  geom_text(
    aes(label = sprintf("%.2f%%", 100 * f), y = f),
    position = position_stack(reverse = FALSE, vjust = 0.5),
    color = "black",
    size = 3.2,
    family = "Fira Sans Condensed"
  )
```
84
# Pilot study: Identification of Light Verb Constructions with *take*
86
```{r prepare_ca, include=FALSE, message=FALSE}
# Currently empty chunk. `output=FALSE` is not a knitr chunk option;
# `include=FALSE` is the knitr way to keep a chunk's code and results out of
# the rendered document.
```
93
## English: *take*
95
```{r take_icc, echo=TRUE, message=FALSE}
# Collocation analysis on the English ICC part: lemma "take" immediately
# followed by a noun (one-token right context, no left context).
take_ca_icc <-
  collocationAnalysis(
    icc_con("eng"),
    "focus({[ud/l=take]} [ud/p=NOUN])",
    leftContextSize = 0,
    rightContextSize = 1,
    minOccur = 2,
    addExamples = TRUE
  )

take_ca_icc %>% show_table()
```
109
### For comparison based on English Wikipedia
#### (Snapshot from 2015 with 2.4 billion words, see [here](https://www.ids-mannheim.de/digspra/kl/projekte/korpora/verfuegbarkeit/))
112
```{r take_wpe, echo=TRUE}
# Use the stored result if it exists; otherwise run the analysis against the
# English KorAP instance.
# NOTE(review): nothing in this chunk writes "../data/take_ca_wpe", so the
# stored result has to be created separately.
if (file.exists("../data/take_ca_wpe")) {
  take_ca_wpe_i <- readRDS("../data/take_ca_wpe")
} else {
  wpe <- new(
    "KorAPConnection",
    "https://korap.ids-mannheim.de/instance/english",
    verbose = TRUE
  )
  take_ca_wpe_i <-
    collocationAnalysis(
      wpe,
      "focus({[tt/l=take]} [tt/p=NN])",
      leftContextSize = 0,
      rightContextSize = 1,
      ignoreCollocateCase = TRUE,
      minOccur = 5,
      addExamples = TRUE
    )
}
take_ca_wpe_i %>% show_table(max = 10000)
```
131
## German: *nehmen*
133
```{r nehmen_icc, echo=TRUE}
# Collocation analysis on the German ICC part: a noun immediately followed by
# lemma "nehmen" (one-token left context, no right context).
nehmen_ca_icc <-
  collocationAnalysis(
    icc_con("ger"),
    "focus([tt/p=NN] {[tt/l=nehmen]})",
    leftContextSize = 1,
    rightContextSize = 0,
    minOccur = 2,
    addExamples = TRUE
  )
nehmen_ca_icc %>% show_table()
```
146
### For comparison based on the whole DeReKo
148
```{r nehmen_dereko}
# Read the stored collocation analysis for "nehmen" and render it as a table.
# NOTE(review): `show__full_table` has a double underscore, unlike the
# `show_table()` calls in the other chunks -- confirm the helper's name in
# common.R.
nehmen_ca_dereko <- readRDS("../data/ca_nehmen_dereko_examples")
show__full_table(nehmen_ca_dereko)
```
153
## Norwegian: *ta*
155
```{r ta_icc, echo=TRUE, message=FALSE}
# Collocation analysis on the Norwegian ICC part: lemma "ta" immediately
# followed by a noun (one-token right context, no left context).
ta_ca_icc <-
  collocationAnalysis(
    icc_con("nor"),
    "focus({[ud/l=ta]} [ud/p=NOUN])",
    leftContextSize = 0,
    rightContextSize = 1,
    minOccur = 2,
    addExamples = TRUE
  )
ta_ca_icc %>% show_table()
```
168