blob: be15f0be0397fc4fee0412f2aa19b493770d74d5 [file] [log] [blame]
---
2title: "News from the International Comparable Corpus"
3subtitle: "First launch of ICC written"
4date: "`r Sys.Date()`"
5author:
6 - name: Marc Kupietz
7 affil: 1
8 - name: Adrien Barbaresi
9 affil: 2
10 - name: Anna Cermakova
11 affil: 3
12 - name: Małgorzata Czachor
13 affil: 4
14 - name: Nils Diewald
15 affil: 1
16 - name: Jarle Ebeling
17 affil: 5
18 - name: Rafał L. Górski
19 affil: 4
20 - name: John Kirk
21 affil: 6
22 - name: Michal Křen
23 affil: 3
24 - name: Harald Lüngen
25 affil: 1
26 - name: Eliza Margaretha
27 affil: 1
28 - name: Signe Oksefjell Ebeling
29 affil: 5
30 - name: Mícheál Ó Meachair
31 affil: 7
32 - name: Ines Pisetta
33 affil: 1
34 - name: Elaine Uí Dhonnchadha
35 affil: 8
36 - name: Friedemann Vogel
37 affil: 9
38 - name: Rebecca Wilm
39 affil: 1
40 - name: Jiajin Xu
41 affil: 10
42 - name: Rameela Yaddehige
43 affil: 1
44affiliation:
45 - num: 1
46 address: IDS Mannheim
47 - num: 2
48 address: BBAW Berlin
49 - num: 3
50 address: Charles University
51 - num: 4
52 address: Polish Academy of Sciences
53 - num: 5
54 address: University of Oslo
55 - num: 6
56 address: University of Vienna
57 - num: 7
58 address: Dublin City University
59 - num: 8
60 address: Trinity College Dublin
61 - num: 9
62 address: University of Siegen
63 - num: 10
64 address: Beijing Foreign Studies University
65
66
67logoleft_name: "../Figures/ICC_COL.svg"
68author_textsize: "32pt"
69
70output:
  posterdown::posterdown_ids
---
73
74```{r setup, include=FALSE, echo=FALSE, warning=FALSE}
# Global chunk defaults for the poster: do not echo code and do not print
# warnings. The knitr chunk option is spelled `warning` (singular); the
# previously used `warnings` is not a chunk option that knitr acts on.
knitr::opts_chunk$set(echo = FALSE, warning = FALSE)
# Shared project code. NOTE(review): presumably provides `icc`, `icc_con()`,
# `genre`, `show_table()` and the package imports used below -- confirm.
source("common.R")
77```
# ICC aims & characteristics
79
80* open initiative
81* to improve the empirical basis for contrastive linguistics
82* by compiling comparable corpora for many languages
83* and making them as freely available as possible
84* also by providing tools to query and analyse them
85* mostly based on existing corpora
86* mimics the composition of ICE
87
88# Current alpha launch
89
90## Composition of parts
91### By ICC genre
92
93```{r composition_by_genre, message = FALSE, fig.width=14, fig.height=10, out.width = "100%"}
# Token counts per language and genre.
# Every row of `icc` is combined with every value of `genre`; rowwise() then
# makes the following mutate() issue one corpusStats() call per row, using the
# `vc` string built for that row.
icc_genre <- icc %>%
  expand_grid(genre) %>%
  mutate(vc = paste0("iccGenre=", genre)) %>%
  rowwise() %>%
  mutate(tokens = corpusStats(icc_con(lang, token), vc = vc)@tokens)

# Stacked bar chart: one bar per language, one segment per genre.
icc_genre %>%
  ggplot(aes(x = lang, fill = genre, y = tokens)) +
  geom_col() +
  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
  theme_ids(base_size = 24) +
  theme(
    axis.title.x = element_text(size = rel(1.5), face = "bold"),
    axis.title.y = element_text(size = rel(1.5), face = "bold"),
    axis.text = element_text(size = rel(0.70)),
    legend.title = element_text(size = rel(0.85), face = "bold"),
    legend.text = element_text(size = rel(1))
  ) +
  scale_fill_ids() +
  # Print the token count in the middle of each segment; empty segments get
  # no label.
  geom_text(
    aes(label = if_else(tokens > 0, as.character(tokens), ""), y = tokens),
    position = position_stack(reverse = FALSE, vjust = 0.5),
    color = "black",
    size = 6.2,
    family = "Fira Sans Condensed"
  )
111
112```
113
114### By date of publication
115
116
117```{r composition_by_pubdate, message=F, warning=F, fig.width=14, fig.height=7, out.width = "100%"}
# Publication years that are queried for every language.
year <- 1986:2023

# Token counts per language and publication year.
# rowwise() makes the following mutate() issue one corpusStats() call per
# (language, year) row, using the `vc` string built for that row.
icc_year <- icc %>%
  expand_grid(year) %>%
  mutate(vc = paste0("pubDate in ", year)) %>%
  rowwise() %>%
  mutate(tokens = corpusStats(icc_con(lang, token), vc = vc)@tokens)

# Smoothed (loess) area per language over the publication years.
icc_year %>%
  ggplot(aes(x = year, fill = lang, color = lang, y = tokens)) +
  # geom_smooth(se=F, span=0.25) +
  # The x scale is limited to 1990-2023, although years from 1986 are queried.
  xlim(1990, 2023) +
  stat_smooth(
    geom = "area", method = "loess", span = 1 / 4,
    alpha = 0.1
  ) +
  # geom_area(alpha=0.1, position = "identity") +
  scale_fill_ids() +
  scale_colour_ids() +
  # The y scale is defined only here. A separate ylim() call would be
  # replaced by this scale and have no effect; pass `limits = ...` to
  # scale_y_continuous() if a fixed range is wanted.
  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
  theme_ids(base_size = 24) +
  theme(
    axis.title.x = element_text(size = rel(1.5), face = "bold"),
    axis.title.y = element_text(size = rel(1.5), face = "bold"),
    axis.text = element_text(size = rel(1)),
    legend.title = element_text(size = rel(1), face = "bold"),
    legend.text = element_text(size = rel(1))
  )
143```
144
145### Part-of-Speech proportions
146
147```{r pos_proportions, fig.width=14, fig.height=10, out.width = "100%"}
# Part-of-speech tags included in the plot. The tags that are commented out
# are deliberately left out of the chart.
POS_tag <- c(
  "ADJ", "ADP", # "PUNCT",
  "ADV", "AUX", # "SYM",
  # "INTJ",
  "CCONJ", # "X",
  "NOUN", "DET",
  "PROPN", # "NUM",
  "VERB", # "PART",
  "PRON",
  "SCONJ"
)

# One frequencyQuery() call per (language, tag) row; the `f` column of the
# result is kept for each row.
icc_by_pos_tag <- icc %>%
  expand_grid(POS = POS_tag) %>%
  rowwise() %>%
  mutate(f = frequencyQuery(icc_con(lang), sprintf("[ud/p=%s]", POS))$f)

# Stacked bar chart: one bar per language, one segment per tag.
icc_by_pos_tag %>%
  ggplot(aes(x = lang, fill = POS, y = f)) +
  geom_col() +
  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
  scale_fill_ids() +
  scale_color_ids() +
  theme_ids(base_size = 24) +
  theme(
    axis.title.x = element_text(size = rel(1.5), face = "bold"),
    axis.title.y = element_text(size = rel(1.5), face = "bold"),
    axis.text = element_text(size = rel(1)),
    legend.title = element_text(size = rel(1), face = "bold"),
    legend.text = element_text(size = rel(1))
  ) +
  # Label each segment with `f` formatted as a percentage.
  geom_text(
    aes(label = sprintf("%.2f%%", 100 * f), y = f),
    position = position_stack(reverse = FALSE, vjust = 0.5),
    color = "black",
    size = 6.2,
    family = "Fira Sans Condensed"
  )
175```
176
177# Identification of Light Verb Constructions with *take*
178
179
180## English: *take*
181
182```{r take_icc, echo=TRUE, message=FALSE}
# Collocation analysis for the lemma "take" directly followed by a noun,
# looking only at one token of right context.
take_ca_icc <-
  collocationAnalysis(
    icc_con("eng"),
    "focus({[ud/l=take]} [ud/p=NOUN])",
    leftContextSize = 0,
    rightContextSize = 1,
    minOccur = 2,
    addExamples = TRUE
  )

take_ca_icc %>% show_table()
194```
195
196