| --- |
| title: "News from the International Comparable Corpus" |
| subtitle: "First launch of ICC written" |
| date: "`r Sys.Date()`" |
| author: |
| - name: Marc Kupietz |
| affil: 1 |
| - name: Adrien Barbaresi |
| affil: 2 |
| - name: Anna Čermáková |
| affil: 3 |
| - name: Małgorzata Czachor |
| affil: 4 |
| - name: Nils Diewald |
| affil: 1 |
| - name: Jarle Ebeling |
| affil: 5 |
| - name: Rafał L. Górski |
| affil: 4 |
| - name: John Kirk |
| affil: 6 |
| - name: Michal Křen |
| affil: 3 |
| - name: Harald Lüngen |
| affil: 1 |
| - name: Eliza Margaretha |
| affil: 1 |
| - name: Signe Oksefjell Ebeling |
| affil: 5 |
| - name: Mícheál Ó Meachair |
| affil: 7 |
| - name: Ines Pisetta |
| affil: 1 |
| - name: Elaine Uí Dhonnchadha |
| affil: 8 |
| - name: Friedemann Vogel |
| affil: 9 |
| - name: Rebecca Wilm |
| affil: 1 |
| - name: Jiajin Xu |
| affil: 10 |
| - name: Rameela Yaddehige |
| affil: 1 |
| affiliation: |
| - num: 1 |
| address: IDS Mannheim |
| - num: 2 |
| address: BBAW Berlin |
| - num: 3 |
| address: Charles University |
| - num: 4 |
| address: Polish Academy of Sciences |
| - num: 5 |
| address: University of Oslo |
| - num: 6 |
| address: University of Vienna |
| - num: 7 |
| address: Dublin City University |
| - num: 8 |
| address: Trinity College Dublin |
| - num: 9 |
| address: University of Siegen |
| - num: 10 |
| address: Beijing Foreign Studies University |
| |
| |
| logoleft_name: "../Figures/ICC_COL.svg" |
| author_textsize: "32pt" |
| |
| contact: |
| qrlink: > |
| `r posterdown::qrlink("https://korap.ids-mannheim.de/instance/icc")` |
| |
| output: |
| posterdown::posterdown_ids: |
| self_contained: false |
| keep_md: true |
| |
| bibliography: ../tex/references.bib |
| csl: "https://raw.githubusercontent.com/ICLC-10/Zotero/master/styles/ICLC-10.csl" |
| --- |
| |
| ```{r setup, include=FALSE, echo=FALSE, warning=FALSE} |
| # Global chunk defaults: render figures as SVG and keep code and warnings out |
| # of the rendered poster. The knitr option is `warning` (singular); the former |
| # `warnings` key is not a knitr chunk option, so it had no effect. |
| knitr::opts_chunk$set(dev = "svg", echo = FALSE, warning = FALSE) |
| # Shared setup. `icc`, `genre`, `icc_con()` and `show_table()` used further |
| # down are not defined in this file, so they presumably come from here. |
| source("common.R") |
| ``` |
| # ICC aims & characteristics |
| * make available comparable corpora of many languages for contrastive linguistic research [@cermakova_international_2021] |
| * mostly based on existing corpora |
| * ICC has a pre-defined “balanced” composition |
| * based on that of the International Corpus of English (ICE) [@greenbaum_comparing_1996] |
| |
| # Current alpha launch |
| |
| ## Composition of parts |
| ### By ICC genre |
| |
| ```{r composition_by_genre, message = FALSE, fig.width=14, fig.height=10, out.width = "100%"} |
| # Token counts per language and ICC genre: pair every row of `icc` with every |
| # value of `genre`, build one virtual-corpus query per pair and fetch its size. |
| icc_genre <- icc %>% |
|   expand_grid(genre) %>% |
|   mutate(vc = paste0("iccGenre=", genre)) %>% |
|   # rowwise() so that corpusStats() is called once per language/genre pair |
|   rowwise() %>% |
|   mutate(tokens = corpusStats(icc_con(lang, token), vc = vc)@tokens) |
| |
| # Stacked bars: one bar per language, one segment per genre. |
| icc_genre %>% ggplot(aes(x = lang, fill = genre, y = tokens)) + |
|   geom_col() + |
|   scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) + |
|   theme_ids(base_size = 24) + |
|   theme( |
|     axis.title.x = element_text(size = rel(1.5), face = "bold"), |
|     axis.title.y = element_text(size = rel(1.5), face = "bold"), |
|     axis.text = element_text(size = rel(0.70)), |
|     legend.title = element_text(size = rel(0.85), face = "bold"), |
|     legend.text = element_text(size = rel(1))) + |
|   scale_fill_ids() + |
|   # Print the token count inside each segment; empty segments get no label. |
|   geom_text( |
|     aes(label = if_else(tokens > 0, as.character(tokens), ""), y = tokens), |
|     position = position_stack(reverse = FALSE, vjust = 0.5), |
|     color = "black", size = 6.2, family = "Fira Sans Condensed" |
|   ) |
| |
| ``` |
| |
| ### By date of publication |
| |
| |
| ```{r composition_by_pubdate, message=FALSE, warning=FALSE, fig.width=14, fig.height=7, out.width = "100%"} |
| # Publication years to query, one virtual corpus per language and year. |
| year <- 1986:2023 |
| |
| # Token counts per language and publication year. |
| icc_year <- icc %>% |
|   expand_grid(year) %>% |
|   mutate(vc = paste0("pubDate in ", year)) %>% |
|   # rowwise() so that corpusStats() is called once per language/year pair |
|   rowwise() %>% |
|   mutate(tokens = corpusStats(icc_con(lang, token), vc = vc)@tokens) |
| |
| # Smoothed (loess) area per language over publication year. |
| icc_year %>% ggplot(aes(x = year, fill = lang, color = lang, y = tokens)) + |
|   # geom_smooth(se=F, span=0.25) + |
|   # NOTE(review): the x limits start at 1990 although `year` starts at 1986, so |
|   # the 1986-1989 rows are queried but fall outside the plotted range - confirm |
|   # that this is intended. |
|   xlim(1990, 2023) + |
|   # NOTE(review): scale_y_continuous() further down adds a second y scale, and |
|   # ggplot2 then replaces this one, so these y limits do not take effect. Left |
|   # in place to keep the rendered figure unchanged; move the limits into |
|   # scale_y_continuous() if a zero-based axis is wanted. |
|   ylim(0, NA) + |
|   stat_smooth( |
|     geom = "area", method = "loess", span = 1 / 4, |
|     alpha = 0.1) + |
|   # geom_area(alpha=0.1, position = "identity") + |
|   scale_fill_ids() + scale_colour_ids() + |
|   scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) + |
|   theme_ids(base_size = 24) + |
|   theme( |
|     axis.title.x = element_text(size = rel(1.5), face = "bold"), |
|     axis.title.y = element_text(size = rel(1.5), face = "bold"), |
|     axis.text = element_text(size = rel(1)), |
|     legend.title = element_text(size = rel(1), face = "bold"), |
|     legend.text = element_text(size = rel(1))) |
| ``` |
| |
| ### Part-of-Speech proportions |
| |
| ```{r pos_proportions, fig.width=14, fig.height=10, out.width = "100%"} |
| # Part-of-speech tags to compare; the commented-out tags are deliberately |
| # left out of the figure. |
| POS_tag <- c( |
|   "ADJ", "ADP", # "PUNCT", |
|   "ADV", "AUX", # "SYM", |
|   # "INTJ", |
|   "CCONJ", # "X", |
|   "NOUN", "DET", |
|   "PROPN", # "NUM", |
|   "VERB", # "PART", |
|   "PRON", |
|   "SCONJ" |
| ) |
| |
| # One frequency query per language and tag, matching on the `ud/p` layer. |
| icc_by_pos_tag <- icc %>% |
|   expand_grid(POS = POS_tag) %>% |
|   # rowwise() so that frequencyQuery() is called once per language/tag pair |
|   rowwise() %>% |
|   mutate(f = frequencyQuery(icc_con(lang), sprintf("[ud/p=%s]", POS))$f) |
| |
| # Stacked bars: one bar per language, one segment per tag. |
| icc_by_pos_tag %>% ggplot(aes(x = lang, fill = POS, y = f)) + |
|   geom_col() + |
|   scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) + |
|   scale_fill_ids() + scale_color_ids() + |
|   theme_ids(base_size = 24) + |
|   theme( |
|     axis.title.x = element_text(size = rel(1.5), face = "bold"), |
|     axis.title.y = element_text(size = rel(1.5), face = "bold"), |
|     axis.text = element_text(size = rel(1)), |
|     legend.title = element_text(size = rel(1), face = "bold"), |
|     legend.text = element_text(size = rel(1))) + |
|   # Label each segment with 100 * f formatted as a percentage (presumably `f` |
|   # is a relative frequency - confirm against frequencyQuery()). |
|   geom_text( |
|     aes(label = sprintf("%.2f%%", 100 * f), y = f), |
|     position = position_stack(reverse = FALSE, vjust = 0.5), |
|     color = "black", size = 6.2, family = "Fira Sans Condensed" |
|   ) |
| ``` |
| |
| # Identification of Light Verb Constructions with *take* |
| |
| |
| ## English: *take* |
| |
| ```{r take_icc, echo=TRUE, message=FALSE} |
| # Collocates of the lemma "take" directly followed by a noun (right context: 1 token) |
| take_ca_icc <- |
|   collocationAnalysis( |
|     icc_con("eng"), |
|     "focus({[ud/l=take]} [ud/p=NOUN])", |
|     leftContextSize = 0, |
|     rightContextSize = 1, |
|     minOccur = 2, |
|     addExamples = TRUE |
|   ) |
| |
| take_ca_icc %>% show_table() |
| ``` |
| |
| # References |
| |
| |