---
title: "ICC Written Launch"
output:
  html_document:
    css: style.css
    self_contained: yes
date: "`r Sys.Date()`"
---

```{r setup, include=FALSE, echo=FALSE, warning=FALSE}
# Global chunk defaults: hide source code and suppress warnings in the report.
# The knitr option is `warning` (singular); the previous `warnings` spelling is
# not an option knitr consults, so warnings were not actually suppressed.
knitr::opts_chunk$set(echo = FALSE, warning = FALSE)
# Names used below but not defined in this file (e.g. `icc`, `icc_con()`,
# `theme_ids()`, `show_table()`) presumably come from here -- TODO confirm.
source("common.R")
```

# Actual composition of ICC parts

## Composition by ICC genre

```{r composition_by_genre, message = FALSE}
# Token counts for every combination of ICC row and genre.
# `icc` and `genre` are not defined in this file (presumably from common.R).
icc_genre <- icc %>%
  expand_grid(genre) %>%
  mutate(vc = paste0("iccGenre=", genre)) %>%
  # Evaluate the query once per row, each with its own virtual corpus `vc`.
  rowwise() %>%
  mutate(tokens = corpusStats(icc_con(lang, token), vc = vc)@tokens) %>%
  # Drop the row-wise grouping so the stored tibble behaves normally later on.
  ungroup()

# Stacked bars per language, filled by genre; each non-empty segment is
# labelled with its token count, centred within the segment.
icc_genre %>%
  ggplot(aes(x = lang, fill = genre, y = tokens)) +
  geom_col() +
  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
  theme_ids() +
  scale_fill_ids() +
  geom_text(
    aes(label = if_else(tokens > 0, as.character(tokens), ""), y = tokens),
    position = position_stack(reverse = FALSE, vjust = 0.5),
    color = "black",
    size = 3.2,
    family = "Fira Sans Condensed"
  )

```

## Composition by date of publication


```{r composition_by_pubdate, message=F, warning=F}
# Publication years to query. The variable name matters: expand_grid() uses it
# as the name of the new column.
year <- 1986:2023

# Token counts for every combination of ICC row and publication year.
icc_year <- icc %>%
  expand_grid(year) %>%
  mutate(vc = paste0("pubDate in ", year)) %>%
  # Evaluate the query once per row, each with its own virtual corpus `vc`.
  rowwise() %>%
  mutate(tokens = corpusStats(icc_con(lang, token), vc = vc)@tokens) %>%
  ungroup()

# Smoothed (loess) token counts over time, one translucent area per language.
icc_year %>%
  ggplot(aes(x = year, fill = lang, color = lang, y = tokens)) +
  # geom_smooth(se = FALSE, span = 0.25) +
  xlim(1990, 2023) +
  stat_smooth(
    geom = "area", method = "loess", span = 1 / 4,
    alpha = 0.1
  ) +
  # geom_area(alpha = 0.1, position = "identity") +
  scale_fill_ids() +
  scale_colour_ids() +
  # The lower limit of 0 has to live in this scale: a separate ylim(0, NA)
  # would be replaced by scale_y_continuous() and silently lost. oob_keep
  # prevents smoothed values that dip below 0 from being turned into NA.
  scale_y_continuous(
    limits = c(0, NA),
    oob = oob_keep,
    labels = label_number(scale_cut = cut_short_scale())
  ) +
  theme_ids()
```

## Part-of-Speech proportions

```{r pos_proportions}
# POS tags queried on the `ud/p` layer.
# Not included: PUNCT, SYM, INTJ, X, NUM, PART.
POS_tag <- c(
  "ADJ", "ADP",
  "ADV", "AUX",
  "CCONJ",
  "NOUN", "DET",
  "PROPN",
  "VERB",
  "PRON",
  "SCONJ"
)

# One frequency query per combination of ICC row and POS tag; `f` is the `f`
# column of the frequencyQuery() result.
icc_by_pos_tag <- icc %>%
  expand_grid(POS = POS_tag) %>%
  rowwise() %>%
  mutate(f = frequencyQuery(icc_con(lang), sprintf("[ud/p=%s]", POS))$f) %>%
  # Drop the row-wise grouping so the stored tibble behaves normally later on.
  ungroup()

# Stacked bars per language, filled by POS tag; each segment is labelled with
# 100 * f formatted as a percentage.
icc_by_pos_tag %>%
  ggplot(aes(x = lang, fill = POS, y = f)) +
  geom_col() +
  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
  scale_fill_ids() +
  scale_color_ids() +
  theme_ids(base_size = 12) +
  geom_text(
    aes(label = sprintf("%.2f%%", 100 * f), y = f),
    position = position_stack(reverse = FALSE, vjust = 0.5),
    color = "black",
    size = 3.2,
    family = "Fira Sans Condensed"
  )
```

# Pilot study: Identification of Light Verb Constructions with *take*

```{r prepare_ca, output=FALSE, message=FALSE}




```

## English: *take*

```{r take_icc, echo=TRUE, message=FALSE}
# Collocation analysis for the lemma "take" followed by a noun (ud layer) in
# the English ICC part, looking at one token of right context only.
take_ca_icc <- collocationAnalysis(
  icc_con("eng"),
  "focus({[ud/l=take]} [ud/p=NOUN])",
  leftContextSize = 0,
  rightContextSize = 1,
  minOccur = 2,
  addExamples = TRUE
)

take_ca_icc %>% show_table()
```

### For comparison based on English Wikipedia
#### (Snapshot from 2015 with 2.4 billion words, see [here](https://www.ids-mannheim.de/digspra/kl/projekte/korpora/verfuegbarkeit/))

```{r take_wpe, echo=TRUE}
# Reuse the stored result if it exists; otherwise run the analysis against the
# English KorAP instance (the fresh result is not written back to disk here).
if (file.exists("../data/take_ca_wpe")) {
  take_ca_wpe <- readRDS("../data/take_ca_wpe")
} else {
  wpe <- new(
    "KorAPConnection",
    "https://korap.ids-mannheim.de/instance/english",
    verbose = TRUE
  )
  take_ca_wpe <- collocationAnalysis(
    wpe,
    "focus({[tt/l=take]} [tt/p=NN])",
    leftContextSize = 0,
    rightContextSize = 1,
    minOccur = 5,
    addExamples = TRUE
  )
}
take_ca_wpe %>% show_table()
```

## German: *nehmen*

```{r nehmen_icc, echo=TRUE}
# Collocation analysis for a noun followed by the lemma "nehmen" (tt layer) in
# the German ICC part, looking at one token of left context only.
nehmen_ca_icc <- collocationAnalysis(
  icc_con("ger"),
  "focus([tt/p=NN] {[tt/l=nehmen]})",
  leftContextSize = 1,
  rightContextSize = 0,
  minOccur = 2,
  addExamples = TRUE
)
nehmen_ca_icc %>% show_table()
```

### For comparison based on the whole DeReKo

```{r nehmen_dereko}
# Load the stored collocation analysis result from disk and render it as a
# table.
nehmen_ca_dereko <- readRDS("../data/ca_nehmen_dereko_examples")
show_table(nehmen_ca_dereko)
```

## Norwegian: *ta*

```{r ta_icc, echo=T, message=FALSE}
# Collocation analysis for the lemma "ta" followed by a noun (ud layer) in the
# Norwegian ICC part, looking at one token of right context only.
ta_ca_icc <- collocationAnalysis(
  icc_con("nor"),
  "focus({[ud/l=ta]} [ud/p=NOUN])",
  leftContextSize = 0,
  rightContextSize = 1,
  minOccur = 2,
  addExamples = TRUE
)
ta_ca_icc %>% show_table()
```

