blob: 8ff8c3bf99b164db8cc2be380c0696fb5dba6d81 [file] [log] [blame]
Marc Kupietz6dd348c2023-05-03 08:26:58 +02001library(RKorAPClient)
2library(tidyverse)
3library(purrrlyr)
Marc Kupietz65eede52023-05-12 17:30:07 +02004library(httr2)
5library(httpuv)
6
7demo_kor_app_id = "773NHGM76N7P9b6rLfmpM4"
Marc Kupietz6dd348c2023-05-03 08:26:58 +02008
9# The challenge in searching gender variants with KorAP and DeReKo is that,
10# firstly, some characters used for gender marking, especially punctuation marks,
11# are interpreted and indexed as token boundaries and, secondly, punctuation
12# marks are currently not indexed in KorAP.
13#
14# The former is intentional with regard to a majority of use cases and with
Marc Kupietzadc86eb2023-05-10 19:49:15 +020015# regard to the reproducibility maxim (see Diewald/Kupietz/L\u00FCngen 2022).
Marc Kupietz6dd348c2023-05-03 08:26:58 +020016# The latter is a shortcoming in KorAP that will be remedied sooner or later
17# and that can be solved provisionally in the meantime with the help of the KorAP API.
18#
19# The following unravelPunctuationGenderCases function, for example, takes the
20# result of a frequencyQuery for two supposedly consecutive tokens and then looks more
21# closely into the KWIC snippets to see which non-indexed strings actually do appear
22# between these tokens and counts the frequencies of the variants that occur.
23
Marc Kupietz65eede52023-05-12 17:30:07 +020024unravelPunctuationGenderCases <- function(df, suffix = "innen", kco) {
Marc Kupietz6dd348c2023-05-03 08:26:58 +020025 if ( nrow(df) > 1) {
26 df %>%
27 dplyr::filter(totalResults > 0 & str_detect(query, paste0(" ", suffix))) %>%
Marc Kupietz65eede52023-05-12 17:30:07 +020028 by_row(unravelPunctuationGenderCases, kco = kco, .collate = "rows", .labels=FALSE) %>%
Marc Kupietz6dd348c2023-05-03 08:26:58 +020029 select(-.row) %>%
30 bind_rows(df %>% dplyr::filter(totalResults == 0 | ! str_detect(query, paste0(" ", suffix)))) %>%
31 tidyr::complete(query, nesting(vc, total), fill = list(totalResults = 0)) %>%
32 select(-f, -conf.low, -conf.high) %>%
33 RKorAPClient::ci() %>%
34 mutate(query = str_replace_all(query, '(^"|"$|[\\[\\]\\\\])', '')) %>%
35 mutate(query = str_replace_all(query, paste0('\\(', suffix), paste0('(', suffix, ')'))) %>%
36 filter(!str_detect(query, paste0("\\w ", suffix))) # remove "Nutzer innen"
37 } else {
38 q <- corpusQuery(kco, df$query, vc=df$vc, metadataOnly = FALSE) %>%
39 fetchAll()
40 cases <- q@collectedMatches$snippet %>%
41 str_replace_all(paste0(".*<mark>.*\\w(\\W+)", suffix, "</mark>.*"), "\\1") %>%
42 as_tibble() %>%
43 group_by(value) %>%
44 summarise(n = n())
45 df %>% uncount(nrow(cases)) %>%
46 mutate(query = str_replace(query, paste0(" (?=", suffix, ")"), cases$value), totalResults = cases$n)
47 }
48}
49
Marc Kupietz65eede52023-05-12 17:30:07 +020050oauthorizeDemo <- function(kco, app_id = demo_kor_app_id) {
51 if (is.null(kco@accessToken) || is.null(kco@welcome)) { # if access token is not set or invalid
52 kco@accessToken <- ( # request one
53 oauth_client(
54 id = app_id, # for the demo application
55 token_url = paste0(kco@apiUrl, "oauth2/token")
56 ) %>%
57 oauth_flow_auth_code(
58 scope = "search match_info",
59 auth_url = paste0(kco@KorAPUrl, "settings/oauth/authorize")
60 )
61 )$access_token
62 }
63 kco
64}
65
Marc Kupietz6dd348c2023-05-03 08:26:58 +020066plotPluralGenderVariants <- function(word = "Nutzer",
67 years = c(1995:2022),
68 as.alternatives = FALSE,
69 vc = "referTo ratskorpus-2023-1 & pubDate in",
70 suffixes = c('Innen', '[\\*]innen"', '[_]innen"', ' innen'),
71 prefixes = c('', '"', '"', ''),
Marc Kupietz65eede52023-05-12 17:30:07 +020072 kco = new("KorAPConnection", verbose=TRUE) %>% oauthorizeDemo()) {
Marc Kupietz6dd348c2023-05-03 08:26:58 +020073 hc <-
74 frequencyQuery(kco, paste0(prefixes, word, suffixes), paste(vc, years), as.alternatives=as.alternatives) %>%
75 unravelPunctuationGenderCases(kco = kco) %>%
76 hc_freq_by_year_ci(as.alternatives)
77 print(hc)
78 hc
79}
80
81
82hc <- plotPluralGenderVariants("Nutzer", c(1995:2022), as.alternatives = FALSE)
83# htmlwidgets::saveWidget(hc, file=fname, selfcontained = TRUE)
84
Marc Kupietzadc86eb2023-05-10 19:49:15 +020085# Diewald, Nils/Kupietz, Marc/L\u00FCngen, Harald (2022):
Marc Kupietz6dd348c2023-05-03 08:26:58 +020086# Tokenizing on scale. Preprocessing large text corpora on the lexical and sentence level.
Marc Kupietzadc86eb2023-05-10 19:49:15 +020087# In: Klosa-K\u00FCckelhaus, Annette/Engelberg, Stefan/M\u00F6hrs, Christine/Storjohann, Petra (eds):
Marc Kupietz6dd348c2023-05-03 08:26:58 +020088# Dictionaries and Society. Proceedings of the XX EURALEX International Congress, 12-16 July 2022.
89# Mannheim: IDS-Verlag, 2022: 208-221.
90# <https://doi.org/10.14618/ids-pub-11146>
91