blob: edd999bd73225d0464d14a8d60ef7a61b0510f01 [file] [log] [blame]
Marc Kupietzafce9c12023-06-13 09:18:53 +02001---
2title: "News from the International Comparable Corpus"
3subtitle: "First launch of ICC written"
4date: "`r Sys.Date()`"
5author:
6 - name: Marc Kupietz
7 affil: 1
8 - name: Adrien Barbaresi
9 affil: 2
Marc Kupietzbcde0b62023-06-14 14:22:35 +020010 - name: Anna Čermáková
Marc Kupietzafce9c12023-06-13 09:18:53 +020011 affil: 3
12 - name: Małgorzata Czachor
13 affil: 4
14 - name: Nils Diewald
15 affil: 1
16 - name: Jarle Ebeling
17 affil: 5
18 - name: Rafał L. Górski
19 affil: 4
20 - name: John Kirk
21 affil: 6
22 - name: Michal Křen
23 affil: 3
24 - name: Harald Lüngen
25 affil: 1
26 - name: Eliza Margaretha
27 affil: 1
28 - name: Signe Oksefjell Ebeling
29 affil: 5
30 - name: Mícheál Ó Meachair
31 affil: 7
32 - name: Ines Pisetta
33 affil: 1
34 - name: Elaine Uí Dhonnchadha
35 affil: 8
36 - name: Friedemann Vogel
37 affil: 9
38 - name: Rebecca Wilm
39 affil: 1
40 - name: Jiajin Xu
41 affil: 10
42 - name: Rameela Yaddehige
43 affil: 1
44affiliation:
45 - num: 1
46 address: IDS Mannheim
47 - num: 2
48 address: BBAW Berlin
49 - num: 3
50 address: Charles University
51 - num: 4
52 address: Polish Academy of Sciences
53 - num: 5
54 address: University of Oslo
55 - num: 6
56 address: University of Vienna
57 - num: 7
58 address: Dublin City University
59 - num: 8
60 address: Trinity College Dublin
61 - num: 9
62 address: University of Siegen
63 - num: 10
64 address: Beijing Foreign Studies University
65
66
67logoleft_name: "../Figures/ICC_COL.svg"
68author_textsize: "32pt"
69
70output:
Marc Kupietz493ef7d2023-06-14 07:19:10 +020071 posterdown::posterdown_ids
Marc Kupietzbcde0b62023-06-14 14:22:35 +020072
73bibliography: ../tex/references.bib
74csl: ids.csl
Marc Kupietzafce9c12023-06-13 09:18:53 +020075---
76
77```{r setup, include=FALSE, echo=FALSE, warning=FALSE}
# Global chunk defaults: render figures as SVG and keep source code and
# warnings out of the poster. The knitr option is spelled `warning`
# (singular); the earlier `warnings = FALSE` matched no option, so it had
# no effect on warning output.
knitr::opts_chunk$set(dev = "svg", echo = FALSE, warning = FALSE)

# Shared project code. The chunks below use names that are not defined in
# this file (e.g. `icc`, `icc_con()`, `theme_ids()`); presumably they are
# set up here -- confirm against common.R.
source("common.R")
80```
# ICC aims & characteristics
82
Marc Kupietzbcde0b62023-06-14 14:22:35 +020083* open initiative [@cermakova_international_2021]
Marc Kupietzafce9c12023-06-13 09:18:53 +020084* to improve the empirical basis for contrastive linguistics
85* by compiling comparable corpora for many languages
86* and making them as freely available as possible
87* also by providing tools to query and analyse them
88* mostly based on existing corpora
89* mimics the composition of ICE
90
91# Current alpha launch
92
93## Composition of parts
94### By ICC genre
95
96```{r composition_by_genre, message = FALSE, fig.width=14, fig.height=10, out.width = "100%"}
# One row per (corpus, genre) combination, with the token count of the
# virtual corpus selected by `iccGenre=<genre>`. `icc`, `genre` and
# `icc_con()` are not defined in this file (presumably from common.R).
icc_genre <- icc %>%
  expand_grid(genre) %>%
  mutate(vc = paste0("iccGenre=", genre)) %>%
  rowwise() %>% # corpusStats() is evaluated once per row
  mutate(tokens = corpusStats(icc_con(lang, token), vc = vc)@tokens) %>%
  ungroup() # drop the row-wise grouping before plotting

# Stacked bar chart: tokens per language, one segment per genre.
icc_genre %>%
  ggplot(aes(x = lang, fill = genre, y = tokens)) +
  geom_col() +
  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
  theme_ids(base_size = 24) +
  theme(
    axis.title.x = element_text(size = rel(1.5), face = "bold"),
    axis.title.y = element_text(size = rel(1.5), face = "bold"),
    axis.text = element_text(size = rel(0.70)),
    legend.title = element_text(size = rel(0.85), face = "bold"),
    legend.text = element_text(size = rel(1))
  ) +
  scale_fill_ids() +
  # Token count centred in each segment; empty segments get no label.
  geom_text(
    aes(label = if_else(tokens > 0, as.character(tokens), ""), y = tokens),
    position = position_stack(reverse = FALSE, vjust = 0.5),
    color = "black",
    size = 6.2,
    family = "Fira Sans Condensed"
  )
114
115```
116
117### By date of publication
118
119
120```{r composition_by_pubdate, message=F, warning=F, fig.width=14, fig.height=7, out.width = "100%"}
# Publication years to query. The variable name matters: expand_grid()
# below turns it into the `year` column.
# NOTE(review): the plot only shows 1990-2023, so the 1986-1989 queries
# look unused -- confirm before narrowing the range.
year <- 1986:2023

# One row per (corpus, year) combination, with the token count of the
# virtual corpus selected by `pubDate in <year>`.
icc_year <- icc %>%
  expand_grid(year) %>%
  mutate(vc = paste0("pubDate in ", year)) %>%
  rowwise() %>% # corpusStats() is evaluated once per row
  mutate(tokens = corpusStats(icc_con(lang, token), vc = vc)@tokens) %>%
  ungroup() # drop the row-wise grouping before plotting

# Smoothed (loess) area per language over publication year.
icc_year %>%
  ggplot(aes(x = year, fill = lang, color = lang, y = tokens)) +
  xlim(1990, 2023) +
  # Alternatives tried earlier:
  #   geom_smooth(se = FALSE, span = 0.25)
  #   geom_area(alpha = 0.1, position = "identity")
  stat_smooth(
    geom = "area", method = "loess", span = 1 / 4,
    alpha = 0.1
  ) +
  scale_fill_ids() +
  scale_colour_ids() +
  # The lower limit of 0 used to be set with a separate ylim(0, NA), which
  # ggplot2 discarded as soon as this second y scale was added. Setting the
  # limits here keeps both the limit and the short-scale labels.
  scale_y_continuous(
    limits = c(0, NA),
    labels = label_number(scale_cut = cut_short_scale())
  ) +
  theme_ids(base_size = 24) +
  theme(
    axis.title.x = element_text(size = rel(1.5), face = "bold"),
    axis.title.y = element_text(size = rel(1.5), face = "bold"),
    axis.text = element_text(size = rel(1)),
    legend.title = element_text(size = rel(1), face = "bold"),
    legend.text = element_text(size = rel(1))
  )
146```
147
148### Part-of-Speech proportions
149
150```{r pos_proportions, fig.width=14, fig.height=10, out.width = "100%"}
# Part-of-speech tags queried on the `ud/p` layer. Tags that earlier
# revisions listed but commented out: PUNCT, SYM, INTJ, X, NUM, PART.
POS_tag <- c(
  "ADJ", "ADP",
  "ADV", "AUX",
  "CCONJ",
  "NOUN", "DET",
  "PROPN",
  "VERB",
  "PRON",
  "SCONJ"
)

# One row per (corpus, POS tag) combination; `f` is the `f` column of the
# frequencyQuery() result for the query `[ud/p=<tag>]`.
icc_by_pos_tag <- icc %>%
  expand_grid(POS = POS_tag) %>%
  rowwise() %>% # frequencyQuery() is evaluated once per row
  mutate(f = frequencyQuery(icc_con(lang), sprintf("[ud/p=%s]", POS))$f) %>%
  ungroup() # drop the row-wise grouping before plotting

# Stacked bar chart: one bar per language, one segment per POS tag.
icc_by_pos_tag %>%
  ggplot(aes(x = lang, fill = POS, y = f)) +
  geom_col() +
  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
  scale_fill_ids() +
  scale_color_ids() +
  theme_ids(base_size = 24) +
  theme(
    axis.title.x = element_text(size = rel(1.5), face = "bold"),
    axis.title.y = element_text(size = rel(1.5), face = "bold"),
    axis.text = element_text(size = rel(1)),
    legend.title = element_text(size = rel(1), face = "bold"),
    legend.text = element_text(size = rel(1))
  ) +
  # Label each segment with `f` formatted as a percentage, centred.
  geom_text(
    aes(label = sprintf("%.2f%%", 100 * f), y = f),
    position = position_stack(reverse = FALSE, vjust = 0.5),
    color = "black",
    size = 6.2,
    family = "Fira Sans Condensed"
  )
178```
179
180# Identification of Light Verb Constructions with *take*
181
182
183## English: *take*
184
185```{r take_icc, echo=TRUE, message=FALSE}
# Collocates one token to the right of lemma "take" when a NOUN follows it
take_ca_icc <-
  collocationAnalysis(
    icc_con("eng"),
    "focus({[ud/l=take]} [ud/p=NOUN])",
    leftContextSize = 0,
    rightContextSize = 1,
    minOccur = 2,
    addExamples = TRUE
  )

take_ca_icc %>% show_table()
197```
198
Marc Kupietzbcde0b62023-06-14 14:22:35 +0200199# References
200
Marc Kupietzafce9c12023-06-13 09:18:53 +0200201