# blob: 8a7a3beb023f9306e8bb02f7406d04f5079b496f
# Demo: relative positions of some adverbs in DeReKo texts.
# RKorAPClient supplies the KorAP client functions used below; tidyverse
# supplies the pipe and the dplyr/purrr verbs.
library(RKorAPClient)
library(tidyverse)

# Connection object handed to every query and statistics call in this script.
kco <- KorAPConnection(verbose = TRUE)

# Seed R's random number generator; the sampling below fetches result pages
# in randomized order.
set.seed(7)

#' Sample matches for a query and locate them within their texts
#'
#' Runs `query` against the corpus behind `kco`, fetches matches with the
#' result pages visited in random order, and adds to every match the size of
#' the text it belongs to and the match start relative to that size.
#' A summary of the relative positions is printed as a side effect.
#'
#' @param kco Connection object passed to `corpusQuery()` and `corpusStats()`.
#' @param query Query string; it is also stored in the `query` column.
#' @param sampleSize Value handed to `fetchNext()` as `maxFetch`.
#' @return The collected matches with the added columns `query`, `vc`,
#'   `textSize` and `relativeTextPosition`; an empty tibble (with a warning)
#'   when the query yields no matches.
get_relative_positions_sample <- function(kco, query, sampleSize = 400) {
  res <- corpusQuery(kco, query) %>%
    fetchNext(maxFetch = sampleSize, randomizePageOrder = TRUE)
  matches <- res@collectedMatches

  # Nothing to position: stop before the per-text statistics lookup, which
  # would have no virtual corpus to ask about.
  if (is.null(matches) || nrow(matches) == 0) {
    warning("No matches found for query: ", query, call. = FALSE)
    return(tibble())
  }

  matches <- matches %>%
    mutate(
      query = query,
      # One virtual corpus definition per match, restricted to its own text.
      vc = paste0('textSigle="', textSigle, '"'),
      textSize = corpusStats(kco, vc, as.df = TRUE)$tokens,
      relativeTextPosition = matchStart / textSize
    )

  cat("\n\n", query, ":\n")
  print(summary(matches$relativeTextPosition))
  cat("\n\n")
  matches
}

# Sample every query in turn and stack the per-query results into one data
# frame; the `query` column added by get_relative_positions_sample() tells
# the rows apart.
df <- c(
  "anfangs/i",
  "zuguterletzt/i",
  "zun\u00e4chst/i", # it is still necessary to encode non ascii characters in R package demos
  "zuallererst/i",
  "zuerst/i",
  "zuletzt/i",
  "schlie\u00dflich/i"
) %>%
  map(~ get_relative_positions_sample(kco, .)) %>%
  bind_rows()

# Reduce the sampled positions to one five-number summary per query
# (min, first quartile, median, third quartile, max) and pack each summary
# into a single numeric vector in the list column `data`, which is what the
# boxplot series below is fed with.
hc_data <- df %>%
  group_by(query) %>%
  summarise(
    min = min(relativeTextPosition),
    q1 = quantile(relativeTextPosition, 0.25),
    median = median(relativeTextPosition),
    q3 = quantile(relativeTextPosition, 0.75),
    max = max(relativeTextPosition)
  ) %>%
  # pmap() walks the five columns row by row; c() joins each row's values.
  mutate(data = pmap(list(min, q1, median, q3, max), c)) %>%
  select(query, data)

# Boxplot with one box per query. Categories and series data both come from
# hc_data, so they share the same row order.
hc <- highchart() %>%
  hc_chart(type = "boxplot", inverted = TRUE) %>%
  hc_xAxis(categories = hc_data$query) %>%
  # Relative positions are fractions of the text length, hence the ceiling.
  hc_yAxis(ceiling = 1, title = list(text = "Relative position in text")) %>%
  hc_add_series(data = hc_data$data) %>%
  hc_title(text = "Relative positions of some adverbs in DeReKo texts") %>%
  hc_legend(enabled = FALSE)

print(hc)