---
title: "ICC Written Launch"
output:
  html_document:
    css: style.css
    self_contained: yes
date: "`r Sys.Date()`"
---

 | 10 | ```{r setup, include=FALSE, echo=FALSE, warning=FALSE} | 
 | 11 | knitr::opts_chunk$set(echo = FALSE, warnings = FALSE) | 
 | 12 | source("common.R") | 
 | 13 | ``` | 

# Actual composition of ICC parts

## Composition by ICC genre

 | 19 | ```{r composition_by_genre, message = FALSE} | 
 | 20 | icc_genre <- icc %>% | 
 | 21 |   expand_grid(genre) %>% | 
 | 22 |   mutate(vc = paste0("iccGenre=", genre)) %>% | 
 | 23 |   rowwise() %>% | 
 | 24 |   mutate(tokens= corpusStats(icc_con(lang, token), vc = vc)@tokens) | 
 | 25 |  | 
 | 26 | icc_genre %>% ggplot(aes(x=lang, fill=genre, y=tokens)) + | 
 | 27 |   geom_col() + scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) + | 
 | 28 |   theme_ids() + | 
| Marc Kupietz | 07645fb | 2023-06-07 11:31:07 +0200 | [diff] [blame] | 29 |   scale_fill_ids() + | 
 | 30 |   geom_text(aes(label=if_else(tokens > 0, as.character(tokens), ""), y=tokens), position= position_stack(reverse = F, vjust = 0.5), color="black", size=3.2, family="Fira Sans Condensed") | 
| Marc Kupietz | 6e21b10 | 2023-06-02 18:04:04 +0200 | [diff] [blame] | 31 |  | 
 | 32 | ``` | 

## Composition by date of publication

```{r composition_by_pubdate, message=FALSE, warning=FALSE}
# Publication years to query (the plot below only shows 1990-2023).
year <- 1986:2023

# One corpusStats() query per combination of `icc` row and year, restricted to
# the virtual corpus "pubDate in <year>".
icc_year <- icc %>%
  expand_grid(year) %>%
  mutate(vc = paste0("pubDate in ", year)) %>%
  rowwise() %>%
  mutate(tokens = corpusStats(icc_con(lang, token), vc = vc)@tokens) %>%
  ungroup() # rowwise grouping is only needed for the per-row query above

# Loess-smoothed area per language.
# The lower y limit of 0 is set inside scale_y_continuous(): a separate
# ylim(0, NA) followed by scale_y_continuous() is silently replaced by the
# latter, so the limit was previously lost.
icc_year %>%
  ggplot(aes(x = year, fill = lang, color = lang, y = tokens)) +
  # geom_smooth(se=F, span=0.25) +
  xlim(1990, 2023) +
  stat_smooth(
    geom = "area", method = "loess", span = 1 / 4,
    alpha = 0.1
  ) +
  # geom_area(alpha=0.1,  position = "identity") +
  scale_fill_ids() +
  scale_colour_ids() +
  scale_y_continuous(
    limits = c(0, NA),
    labels = label_number(scale_cut = cut_short_scale())
  ) +
  theme_ids()
```

## Part-of-Speech proportions

 | 61 | ```{r pos_proportions} | 
 | 62 | POS_tag <- c( | 
 | 63 |   "ADJ", 	"ADP",#	"PUNCT", | 
 | 64 |   "ADV",	"AUX",	# "SYM", | 
| Marc Kupietz | ddda028 | 2023-06-07 17:48:37 +0200 | [diff] [blame] | 65 |   # "INTJ", | 
 | 66 |   "CCONJ", #	"X", | 
| Marc Kupietz | 6e21b10 | 2023-06-02 18:04:04 +0200 | [diff] [blame] | 67 |   "NOUN",	"DET", | 
 | 68 |   "PROPN",	#"NUM", | 
 | 69 |   "VERB",	#"PART", | 
 | 70 |   "PRON", | 
 | 71 |   "SCONJ" | 
 | 72 |   ) | 
 | 73 |  | 
 | 74 | icc_by_pos_tag <- icc %>% expand_grid(POS = POS_tag) %>% | 
 | 75 |   rowwise() %>% | 
 | 76 |   mutate(f = frequencyQuery(icc_con(lang), sprintf("[ud/p=%s]", POS))$f) | 
 | 77 |  | 
 | 78 | icc_by_pos_tag %>% ggplot(aes(x=lang, fill = POS, y=f)) + | 
 | 79 |   geom_col() + scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) + | 
| Marc Kupietz | d5540e9 | 2023-06-07 17:48:01 +0200 | [diff] [blame] | 80 |   scale_fill_ids() + scale_color_ids() + | 
| Marc Kupietz | 6e21b10 | 2023-06-02 18:04:04 +0200 | [diff] [blame] | 81 |   theme_ids(base_size = 12) + | 
| Marc Kupietz | d5540e9 | 2023-06-07 17:48:01 +0200 | [diff] [blame] | 82 |   geom_text(aes(label=sprintf("%.2f%%", 100*f), y=f), position= position_stack(reverse = F, vjust = 0.5), color="black", size=3.2, family="Fira Sans Condensed") | 
| Marc Kupietz | 6e21b10 | 2023-06-02 18:04:04 +0200 | [diff] [blame] | 83 | ``` | 

# Pilot study: Identification of Light Verb Constructions with *take*

 | 87 | ```{r prepare_ca, output=FALSE, message=FALSE} | 
 | 88 |  | 
 | 89 |  | 
 | 90 |  | 
 | 91 |  | 
 | 92 | ``` | 

## English: *take*

 | 96 | ```{r take_icc, echo=TRUE, message=FALSE} | 
 | 97 | take_ca_icc <- | 
 | 98 |   collocationAnalysis( | 
 | 99 |     icc_con("eng"), | 
 | 100 |     "focus({[ud/l=take]} [ud/p=NOUN])", | 
 | 101 |     leftContextSize = 0, | 
 | 102 |     rightContextSize = 1, | 
 | 103 |     minOccur = 2, | 
 | 104 |     addExamples = T | 
 | 105 |   ) | 
 | 106 |  | 
 | 107 | take_ca_icc %>% show_table() | 
 | 108 | ``` | 

### For comparison based on English Wikipedia
#### (Snapshot from 2015 with 2.4 billion words, see [here](https://www.ids-mannheim.de/digspra/kl/projekte/korpora/verfuegbarkeit/))

 | 113 | ```{r take_wpe, echo=TRUE} | 
 | 114 | if (file.exists("../data/take_ca_wpe")) { | 
| Marc Kupietz | d52056a | 2023-06-26 20:38:03 +0200 | [diff] [blame] | 115 |   take_ca_wpe_i <- readRDS("../data/take_ca_wpe") | 
| Marc Kupietz | fdac64b | 2023-06-13 08:32:37 +0200 | [diff] [blame] | 116 | } else { | 
 | 117 | wpe <- new("KorAPConnection", "https://korap.ids-mannheim.de/instance/english", verbose=T) | 
| Marc Kupietz | d52056a | 2023-06-26 20:38:03 +0200 | [diff] [blame] | 118 | take_ca_wpe_i <- | 
| Marc Kupietz | fdac64b | 2023-06-13 08:32:37 +0200 | [diff] [blame] | 119 |   collocationAnalysis( | 
 | 120 |     wpe, | 
 | 121 |     "focus({[tt/l=take]} [tt/p=NN])", | 
 | 122 |     leftContextSize = 0, | 
 | 123 |     rightContextSize = 1, | 
| Marc Kupietz | d52056a | 2023-06-26 20:38:03 +0200 | [diff] [blame] | 124 |     ignoreCollocateCase = TRUE, | 
| Marc Kupietz | fdac64b | 2023-06-13 08:32:37 +0200 | [diff] [blame] | 125 |     minOccur = 5, | 
 | 126 |     addExamples = T | 
 | 127 |   ) | 
 | 128 | } | 
| Marc Kupietz | d52056a | 2023-06-26 20:38:03 +0200 | [diff] [blame] | 129 | take_ca_wpe_i %>% show_table(max=10000) | 
| Marc Kupietz | fdac64b | 2023-06-13 08:32:37 +0200 | [diff] [blame] | 130 | ``` | 

## German: *nehmen*

 | 134 | ```{r nehmen_icc, echo=TRUE} | 
 | 135 | nehmen_ca_icc <- | 
 | 136 |   collocationAnalysis( | 
 | 137 |     icc_con("ger"), | 
 | 138 |     "focus([tt/p=NN] {[tt/l=nehmen]})", | 
 | 139 |     leftContextSize = 1, | 
 | 140 |     rightContextSize = 0, | 
 | 141 |     minOccur = 2, | 
 | 142 |     addExamples = T | 
 | 143 |   ) | 
 | 144 | nehmen_ca_icc %>% show_table() | 
 | 145 | ``` | 

### For comparison based on the whole DeReKo

 | 149 | ```{r nehmen_dereko} | 
 | 150 | nehmen_ca_dereko <- readRDS("../data/ca_nehmen_dereko_examples") | 
| Marc Kupietz | d52056a | 2023-06-26 20:38:03 +0200 | [diff] [blame] | 151 | nehmen_ca_dereko %>% show__full_table() | 
| Marc Kupietz | 6e21b10 | 2023-06-02 18:04:04 +0200 | [diff] [blame] | 152 | ``` | 

## Norwegian: *ta*

```{r ta_icc, echo=TRUE, message=FALSE}
# Collocates in the one-token window to the right of the lemma "ta",
# for matches of "ta" immediately followed by a noun.
ta_ca_icc <-
  collocationAnalysis(
    icc_con("nor"),
    "focus({[ud/l=ta]} [ud/p=NOUN])",
    leftContextSize = 0,
    rightContextSize = 1,
    minOccur = 2,
    addExamples = TRUE
  )

ta_ca_icc %>% show_table()
```
 | 168 |  |