blob: c6bb39db66026291612deba547749b37629b96f1 [file] [log] [blame]
Marc Kupietzafce9c12023-06-13 09:18:53 +02001---
2title: "News from the International Comparable Corpus"
3subtitle: "First launch of ICC written"
4date: "`r Sys.Date()`"
5author:
6 - name: Marc Kupietz
7 affil: 1
8 - name: Adrien Barbaresi
9 affil: 2
Marc Kupietzbcde0b62023-06-14 14:22:35 +020010 - name: Anna Čermáková
Marc Kupietzafce9c12023-06-13 09:18:53 +020011 affil: 3
12 - name: Małgorzata Czachor
13 affil: 4
14 - name: Nils Diewald
15 affil: 1
16 - name: Jarle Ebeling
17 affil: 5
18 - name: Rafał L. Górski
19 affil: 4
20 - name: John Kirk
21 affil: 6
22 - name: Michal Křen
23 affil: 3
24 - name: Harald Lüngen
25 affil: 1
26 - name: Eliza Margaretha
27 affil: 1
28 - name: Signe Oksefjell Ebeling
29 affil: 5
30 - name: Mícheál Ó Meachair
31 affil: 7
32 - name: Ines Pisetta
33 affil: 1
34 - name: Elaine Uí Dhonnchadha
35 affil: 8
36 - name: Friedemann Vogel
37 affil: 9
38 - name: Rebecca Wilm
39 affil: 1
40 - name: Jiajin Xu
41 affil: 10
42 - name: Rameela Yaddehige
43 affil: 1
44affiliation:
45 - num: 1
46 address: IDS Mannheim
47 - num: 2
48 address: BBAW Berlin
49 - num: 3
50 address: Charles University
51 - num: 4
52 address: Polish Academy of Sciences
53 - num: 5
54 address: University of Oslo
55 - num: 6
56 address: University of Vienna
57 - num: 7
58 address: Dublin City University
59 - num: 8
60 address: Trinity College Dublin
61 - num: 9
62 address: University of Siegen
63 - num: 10
64 address: Beijing Foreign Studies University
65
66
67logoleft_name: "../Figures/ICC_COL.svg"
68author_textsize: "32pt"
69
Marc Kupietzfbd648c2023-06-24 12:31:45 +020070contact:
71 qrcode: icc_qrcode.svg
72
Marc Kupietzafce9c12023-06-13 09:18:53 +020073output:
Marc Kupietzfbd648c2023-06-24 12:31:45 +020074 posterdown::posterdown_ids:
75 self_contained: false
76 keep_md: true
Marc Kupietzbcde0b62023-06-14 14:22:35 +020077
78bibliography: ../tex/references.bib
79csl: ids.csl
Marc Kupietzafce9c12023-06-13 09:18:53 +020080---
81
82```{r setup, include=FALSE, echo=FALSE, warning=FALSE}
# Document-wide setup: load the QR-code package, set knitr defaults, pull in
# the shared helpers from common.R, and write the QR code image that the YAML
# header references as `contact: qrcode`.
library(qrcode)

# The knitr option is `warning` (singular). The previous `warnings = FALSE`
# is not a knitr chunk option, so warnings were not suppressed globally.
knitr::opts_chunk$set(dev = "svg", echo = FALSE, warning = FALSE)

# NOTE(review): the later chunks use names that are not defined in this file
# (icc, genre, icc_con, theme_ids, ...); presumably common.R provides them.
source("common.R")

generate_svg(
  qr_code("https://korap.ids-mannheim.de/instance/icc"),
  "icc_qrcode.svg"
)
Marc Kupietzafce9c12023-06-13 09:18:53 +020087```
88# ICC aims & characteristics
89
Marc Kupietzbcde0b62023-06-14 14:22:35 +020090* open initiative [@cermakova_international_2021]
Marc Kupietzafce9c12023-06-13 09:18:53 +020091* to improve the empirical basis for contrastive linguistics
92* by compiling comparable corpora for many languages
93* and making them as freely available as possible
94* also by providing tools to query and analyse them
95* mostly based on existing corpora
96* mimics the composition of ICE
97
98# Current alpha launch
99
100## Composition of parts
101### By ICC genre
102
103```{r composition_by_genre, message = FALSE, fig.width=14, fig.height=10, out.width = "100%"}
# Token counts per language and ICC genre, drawn as one stacked bar per
# language. `icc`, `genre`, `icc_con()` and the *_ids theme/scale helpers are
# not defined in this chunk (presumably they come from common.R -- confirm).
icc_genre <- icc %>%
  expand_grid(genre) %>%
  # One virtual-corpus query string per (language, genre) row.
  mutate(vc = paste0("iccGenre=", genre)) %>%
  # corpusStats() is called once per row, hence rowwise().
  rowwise() %>%
  mutate(tokens = corpusStats(icc_con(lang, token), vc = vc)@tokens) %>%
  # Drop the row-wise grouping so later verbs see an ordinary data frame.
  ungroup()

icc_genre %>%
  ggplot(aes(x = lang, fill = genre, y = tokens)) +
  geom_col() +
  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
  theme_ids(base_size = 24) +
  theme(
    axis.title.x = element_text(size = rel(1.5), face = "bold"),
    axis.title.y = element_text(size = rel(1.5), face = "bold"),
    axis.text = element_text(size = rel(0.70)),
    legend.title = element_text(size = rel(0.85), face = "bold"),
    legend.text = element_text(size = rel(1))
  ) +
  scale_fill_ids() +
  # Print the token count in the middle of each segment; empty segments get
  # no label.
  geom_text(
    aes(label = if_else(tokens > 0, as.character(tokens), ""), y = tokens),
    position = position_stack(reverse = FALSE, vjust = 0.5),
    color = "black",
    size = 6.2,
    family = "Fira Sans Condensed"
  )
121
122```
123
124### By date of publication
125
126
127```{r composition_by_pubdate, message=F, warning=F, fig.width=14, fig.height=7, out.width = "100%"}
# Token counts per language and publication year, drawn as loess-smoothed
# areas. `icc`, `icc_con()` and the *_ids helpers are not defined in this
# chunk (presumably they come from common.R -- confirm).
year <- 1986:2023

icc_year <- icc %>%
  # The unnamed argument makes expand_grid() name the new column `year`.
  expand_grid(year) %>%
  mutate(vc = paste0("pubDate in ", year)) %>%
  # corpusStats() is called once per row, hence rowwise().
  rowwise() %>%
  mutate(tokens = corpusStats(icc_con(lang, token), vc = vc)@tokens) %>%
  # Drop the row-wise grouping so later verbs see an ordinary data frame.
  ungroup()

icc_year %>%
  ggplot(aes(x = year, fill = lang, color = lang, y = tokens)) +
  # geom_smooth(se=F, span=0.25) +
  # The queried range starts in 1986 but only 1990-2023 is plotted.
  xlim(1990, 2023) +
  stat_smooth(
    geom = "area", method = "loess", span = 1 / 4,
    alpha = 0.1
  ) +
  # geom_area(alpha=0.1, position = "identity") +
  scale_fill_ids() +
  scale_colour_ids() +
  # The lower limit of 0 used to be set with a separate ylim(0, NA), but a
  # later scale_y_continuous() replaces an earlier y scale, so that limit was
  # discarded. Setting it here keeps both the limit and the label format.
  # As with ylim(), values outside the limits are treated as missing.
  scale_y_continuous(
    limits = c(0, NA),
    labels = label_number(scale_cut = cut_short_scale())
  ) +
  theme_ids(base_size = 24) +
  theme(
    axis.title.x = element_text(size = rel(1.5), face = "bold"),
    axis.title.y = element_text(size = rel(1.5), face = "bold"),
    axis.text = element_text(size = rel(1)),
    legend.title = element_text(size = rel(1), face = "bold"),
    legend.text = element_text(size = rel(1))
  )
153```
154
155### Part-of-Speech proportions
156
157```{r pos_proportions, fig.width=14, fig.height=10, out.width = "100%"}
# Part-of-speech tags to compare across languages. The commented-out tags
# are deliberately excluded from the plot.
POS_tag <- c(
  "ADJ", "ADP", # "PUNCT",
  "ADV", "AUX", # "SYM",
  # "INTJ",
  "CCONJ", # "X",
  "NOUN", "DET",
  "PROPN", # "NUM",
  "VERB", # "PART",
  "PRON",
  "SCONJ"
)

# One frequency query per (language, POS tag) pair. `icc`, `icc_con()` and
# the *_ids helpers are not defined in this chunk (presumably they come from
# common.R -- confirm).
icc_by_pos_tag <- icc %>%
  expand_grid(POS = POS_tag) %>%
  # frequencyQuery() is called once per row, hence rowwise().
  rowwise() %>%
  mutate(f = frequencyQuery(icc_con(lang), sprintf("[ud/p=%s]", POS))$f) %>%
  # Drop the row-wise grouping so later verbs see an ordinary data frame.
  ungroup()

icc_by_pos_tag %>%
  ggplot(aes(x = lang, fill = POS, y = f)) +
  geom_col() +
  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
  scale_fill_ids() +
  scale_color_ids() +
  theme_ids(base_size = 24) +
  theme(
    axis.title.x = element_text(size = rel(1.5), face = "bold"),
    axis.title.y = element_text(size = rel(1.5), face = "bold"),
    axis.text = element_text(size = rel(1)),
    legend.title = element_text(size = rel(1), face = "bold"),
    legend.text = element_text(size = rel(1))
  ) +
  # Label each segment with 100 * f as a percentage.
  # NOTE(review): this assumes `f` is a relative frequency -- confirm.
  geom_text(
    aes(label = sprintf("%.2f%%", 100 * f), y = f),
    position = position_stack(reverse = FALSE, vjust = 0.5),
    color = "black",
    size = 6.2,
    family = "Fira Sans Condensed"
  )
185```
186
187# Identification of Light Verb Constructions with *take*
188
189
190## English: *take*
191
192```{r take_icc, echo=TRUE, message=FALSE}
# Nouns directly to the right of the lemma "take" in the English ICC part.
take_ca_icc <-
  collocationAnalysis(
    icc_con("eng"),
    "focus({[ud/l=take]} [ud/p=NOUN])",
    leftContextSize = 0,
    rightContextSize = 1,
    minOccur = 2,
    addExamples = TRUE
  )

take_ca_icc %>% show_table()
204```
205
Marc Kupietzbcde0b62023-06-14 14:22:35 +0200206# References
207
Marc Kupietzafce9c12023-06-13 09:18:53 +0200208