blob: be15f0be0397fc4fee0412f2aa19b493770d74d5 [file] [log] [blame]
---
title: "News from the International Comparable Corpus"
subtitle: "First launch of ICC written"
date: "`r Sys.Date()`"
author:
- name: Marc Kupietz
affil: 1
- name: Adrien Barbaresi
affil: 2
- name: Anna Cermakova
affil: 3
- name: Małgorzata Czachor
affil: 4
- name: Nils Diewald
affil: 1
- name: Jarle Ebeling
affil: 5
- name: Rafał L. Górski
affil: 4
- name: John Kirk
affil: 6
- name: Michal Křen
affil: 3
- name: Harald Lüngen
affil: 1
- name: Eliza Margaretha
affil: 1
- name: Signe Oksefjell Ebeling
affil: 5
- name: Mícheál Ó Meachair
affil: 7
- name: Ines Pisetta
affil: 1
- name: Elaine Uí Dhonnchadha
affil: 8
- name: Friedemann Vogel
affil: 9
- name: Rebecca Wilm
affil: 1
- name: Jiajin Xu
affil: 10
- name: Rameela Yaddehige
affil: 1
affiliation:
- num: 1
address: IDS Mannheim
- num: 2
address: BBAW Berlin
- num: 3
address: Charles University
- num: 4
address: Polish Academy of Sciences
- num: 5
address: University of Oslo
- num: 6
address: University of Vienna
- num: 7
address: Dublin City University
- num: 8
address: Trinity College Dublin
- num: 9
address: University of Siegen
- num: 10
address: Beijing Foreign Studies University
logoleft_name: "../Figures/ICC_COL.svg"
author_textsize: "32pt"
output:
posterdown::posterdown_ids
---
```{r setup, include=FALSE, echo=FALSE, warning=FALSE}
# Global chunk defaults for the poster: do not echo code, do not print warnings.
# The knitr chunk option is spelled `warning` (singular); the previous
# `warnings = FALSE` set an option knitr never reads, so warnings were only
# suppressed in chunks that disabled them explicitly.
knitr::opts_chunk$set(echo = FALSE, warning = FALSE)
# Load the project helpers used by the chunks below
# (presumably `icc`, `icc_con()`, `genre`, theme helpers; not visible here).
source("common.R")
```
# ICC aims & characteristics
* open initiative
* to improve the empirical basis for contrastive linguistics
* by compiling comparable corpora for many languages
* and making them as freely available as possible
* also by providing tools to query and analyse them
* mostly based on existing corpora
* mimics the composition of ICE
# Current alpha launch
## Composition of parts
### By ICC genre
```{r composition_by_genre, message = FALSE, fig.width=14, fig.height=10, out.width = "100%"}
# Token counts per language and ICC genre: every row of `icc` is crossed with
# every genre, and one corpusStats() call is made per (language, genre) pair
# using a virtual corpus definition of the form "iccGenre=<genre>".
icc_genre <- icc %>%
  expand_grid(genre) %>%
  mutate(vc = paste0("iccGenre=", genre)) %>%
  rowwise() %>%
  mutate(tokens = corpusStats(icc_con(lang, token), vc = vc)@tokens)

# Stacked bar chart: one bar per language, segments coloured by genre.
ggplot(icc_genre, aes(x = lang, fill = genre, y = tokens)) +
  geom_col() +
  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
  scale_fill_ids() +
  theme_ids(base_size = 24) +
  theme(
    axis.title.x = element_text(size = rel(1.5), face = "bold"),
    axis.title.y = element_text(size = rel(1.5), face = "bold"),
    axis.text = element_text(size = rel(0.70)),
    legend.title = element_text(size = rel(0.85), face = "bold"),
    legend.text = element_text(size = rel(1))
  ) +
  # Print the token count centred inside each segment; empty segments get
  # no label.
  geom_text(
    aes(label = if_else(tokens > 0, as.character(tokens), ""), y = tokens),
    position = position_stack(reverse = FALSE, vjust = 0.5),
    color = "black",
    size = 6.2,
    family = "Fira Sans Condensed"
  )
```
### By date of publication
```{r composition_by_pubdate, message=F, warning=F, fig.width=14, fig.height=7, out.width = "100%"}
# Token counts per language and publication year, drawn as LOESS-smoothed
# areas (one per language).
# `year` must keep this name: expand_grid() takes the column name from it.
# NOTE(review): 1986-1989 are queried but fall outside xlim(1990, 2023) below;
# confirm whether these extra queries are intended.
year <- 1986:2023

# One corpusStats() call per (language, year) pair, restricted by a virtual
# corpus definition of the form "pubDate in <year>".
icc_year <- icc %>%
  expand_grid(year) %>%
  mutate(vc = paste0("pubDate in ", year)) %>%
  rowwise() %>%
  mutate(tokens = corpusStats(icc_con(lang, token), vc = vc)@tokens)

icc_year %>%
  ggplot(aes(x = year, fill = lang, color = lang, y = tokens)) +
  # geom_smooth(se=F, span=0.25) +
  xlim(1990, 2023) +
  stat_smooth(
    geom = "area", method = "loess", span = 1 / 4,
    alpha = 0.1
  ) +
  # geom_area(alpha=0.1, position = "identity") +
  scale_fill_ids() +
  scale_colour_ids() +
  # The lower y limit is set here rather than via a separate ylim(0, NA):
  # a plot has only one y scale, so a ylim() placed before
  # scale_y_continuous() was silently replaced and never took effect.
  scale_y_continuous(
    limits = c(0, NA),
    labels = label_number(scale_cut = cut_short_scale())
  ) +
  theme_ids(base_size = 24) +
  theme(
    axis.title.x = element_text(size = rel(1.5), face = "bold"),
    axis.title.y = element_text(size = rel(1.5), face = "bold"),
    axis.text = element_text(size = rel(1)),
    legend.title = element_text(size = rel(1), face = "bold"),
    legend.text = element_text(size = rel(1))
  )
```
### Part-of-Speech proportions
```{r pos_proportions, fig.width=14, fig.height=10, out.width = "100%"}
# Part-of-speech tags included in the chart; the commented-out tags are
# deliberately left out.
POS_tag <- c(
  "ADJ", "ADP", # "PUNCT",
  "ADV", "AUX", # "SYM",
  # "INTJ",
  "CCONJ", # "X",
  "NOUN", "DET",
  "PROPN", # "NUM",
  "VERB", # "PART",
  "PRON",
  "SCONJ"
)

# One frequencyQuery() per (language, POS tag) pair; the `f` column of the
# result is kept. The labels below format it as a percentage, i.e. the code
# treats `f` as a proportion.
icc_by_pos_tag <- icc %>%
  expand_grid(POS = POS_tag) %>%
  rowwise() %>%
  mutate(f = frequencyQuery(icc_con(lang), sprintf("[ud/p=%s]", POS))$f)

# Stacked bar chart: one bar per language, segments coloured by POS tag.
ggplot(icc_by_pos_tag, aes(x = lang, fill = POS, y = f)) +
  geom_col() +
  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
  scale_fill_ids() +
  scale_color_ids() +
  theme_ids(base_size = 24) +
  theme(
    axis.title.x = element_text(size = rel(1.5), face = "bold"),
    axis.title.y = element_text(size = rel(1.5), face = "bold"),
    axis.text = element_text(size = rel(1)),
    legend.title = element_text(size = rel(1), face = "bold"),
    legend.text = element_text(size = rel(1))
  ) +
  # Percentage label centred inside each segment.
  geom_text(
    aes(label = sprintf("%.2f%%", 100 * f), y = f),
    position = position_stack(reverse = FALSE, vjust = 0.5),
    color = "black",
    size = 6.2,
    family = "Fira Sans Condensed"
  )
```
# Identification of Light Verb Constructions with *take*
## English: *take*
```{r take_icc, echo=TRUE, message=FALSE}
# Collocates of the lemma "take" in the English ICC part: the query matches
# "take" directly followed by a noun, and the collocation window is the single
# token to the right of the match.
take_ca_icc <- collocationAnalysis(
  icc_con("eng"),
  "focus({[ud/l=take]} [ud/p=NOUN])",
  leftContextSize = 0,
  rightContextSize = 1,
  minOccur = 2,
  addExamples = TRUE
)
show_table(take_ca_icc)
```