blob: 5241987703ac646c97e3486b5b4a6e00dd3e4b2f [file] [log] [blame]
Marc Kupietz6dd348c2023-05-03 08:26:58 +02001library(RKorAPClient)
2library(tidyverse)
3library(purrrlyr)
Marc Kupietz65eede52023-05-12 17:30:07 +02004library(httr2)
5library(httpuv)
Marc Kupietz1ce833d2024-01-24 14:14:21 +01006library(RMariaDB)
7library(tidyfst)
Marc Kupietz65eede52023-05-12 17:30:07 +02008
9demo_kor_app_id = "773NHGM76N7P9b6rLfmpM4"
Marc Kupietz1ce833d2024-01-24 14:14:21 +010010source("https://www.stgries.info/research/dispersion/dispersions.r")
Marc Kupietz6dd348c2023-05-03 08:26:58 +020011
12# The challenge in searching gender variants with KorAP and DeReKo is that,
13# firstly, some characters used for gender marking, especially punctuation marks,
14# are interpreted and indexed as token boundaries and, secondly, punctuation
15# marks are currently not indexed in KorAP.
16#
17# The former is intentional with regard to a majority of use cases and with
Marc Kupietzadc86eb2023-05-10 19:49:15 +020018# regard to the reproducibility maxim (see Diewald/Kupietz/L\u00FCngen 2022).
Marc Kupietz6dd348c2023-05-03 08:26:58 +020019# The latter is a shortcoming in KorAP that will be remedied sooner or later
20# and that can be solved provisionally in the meantime with the help of the KorAP API.
21#
22# The following unravelPunctuationGenderCases function, for example, takes the
23# result of a frequencyQuery for two supposedly consecutive tokens and then looks more
24# closely into the KWIC snippets to see which non-indexed strings actually do appear
25# between these tokens and counts the frequencies of the variants that occur.
26
Marc Kupietz65eede52023-05-12 17:30:07 +020027unravelPunctuationGenderCases <- function(df, suffix = "innen", kco) {
Marc Kupietz6dd348c2023-05-03 08:26:58 +020028 if ( nrow(df) > 1) {
29 df %>%
30 dplyr::filter(totalResults > 0 & str_detect(query, paste0(" ", suffix))) %>%
Marc Kupietz65eede52023-05-12 17:30:07 +020031 by_row(unravelPunctuationGenderCases, kco = kco, .collate = "rows", .labels=FALSE) %>%
Marc Kupietz6dd348c2023-05-03 08:26:58 +020032 select(-.row) %>%
33 bind_rows(df %>% dplyr::filter(totalResults == 0 | ! str_detect(query, paste0(" ", suffix)))) %>%
34 tidyr::complete(query, nesting(vc, total), fill = list(totalResults = 0)) %>%
35 select(-f, -conf.low, -conf.high) %>%
36 RKorAPClient::ci() %>%
37 mutate(query = str_replace_all(query, '(^"|"$|[\\[\\]\\\\])', '')) %>%
38 mutate(query = str_replace_all(query, paste0('\\(', suffix), paste0('(', suffix, ')'))) %>%
39 filter(!str_detect(query, paste0("\\w ", suffix))) # remove "Nutzer innen"
40 } else {
41 q <- corpusQuery(kco, df$query, vc=df$vc, metadataOnly = FALSE) %>%
42 fetchAll()
43 cases <- q@collectedMatches$snippet %>%
44 str_replace_all(paste0(".*<mark>.*\\w(\\W+)", suffix, "</mark>.*"), "\\1") %>%
45 as_tibble() %>%
46 group_by(value) %>%
47 summarise(n = n())
48 df %>% uncount(nrow(cases)) %>%
49 mutate(query = str_replace(query, paste0(" (?=", suffix, ")"), cases$value), totalResults = cases$n)
50 }
51}
52
Marc Kupietz65eede52023-05-12 17:30:07 +020053oauthorizeDemo <- function(kco, app_id = demo_kor_app_id) {
54 if (is.null(kco@accessToken) || is.null(kco@welcome)) { # if access token is not set or invalid
55 kco@accessToken <- ( # request one
56 oauth_client(
57 id = app_id, # for the demo application
58 token_url = paste0(kco@apiUrl, "oauth2/token")
59 ) %>%
60 oauth_flow_auth_code(
61 scope = "search match_info",
62 auth_url = paste0(kco@KorAPUrl, "settings/oauth/authorize")
63 )
64 )$access_token
65 }
66 kco
67}
68
Marc Kupietz6dd348c2023-05-03 08:26:58 +020069plotPluralGenderVariants <- function(word = "Nutzer",
70 years = c(1995:2022),
71 as.alternatives = FALSE,
72 vc = "referTo ratskorpus-2023-1 & pubDate in",
73 suffixes = c('Innen', '[\\*]innen"', '[_]innen"', ' innen'),
74 prefixes = c('', '"', '"', ''),
Marc Kupietz65eede52023-05-12 17:30:07 +020075 kco = new("KorAPConnection", verbose=TRUE) %>% oauthorizeDemo()) {
Marc Kupietz6dd348c2023-05-03 08:26:58 +020076 hc <-
77 frequencyQuery(kco, paste0(prefixes, word, suffixes), paste(vc, years), as.alternatives=as.alternatives) %>%
78 unravelPunctuationGenderCases(kco = kco) %>%
79 hc_freq_by_year_ci(as.alternatives)
80 print(hc)
81 hc
82}
83
84
85hc <- plotPluralGenderVariants("Nutzer", c(1995:2022), as.alternatives = FALSE)
Marc Kupietz1ce833d2024-01-24 14:14:21 +010086
87getOKKSourceTitles <- function() {
88 db <- dbConnect(MariaDB(), host="klinux10.ids-mannheim.de", user="viewer", dbname="corpora")
89 dbExecute(db, "SET NAMES 'utf8'")
90 rs <- dbSendQuery(db, "SELECT title from basename WHERE basename.rsr")
91 corpus_parts <- dbFetch(rs)
92 dbClearResult(rs)
93 dbDisconnect(db)
94 return(corpus_parts$title)
95}
96
97sourceDispersions <- function(word = "Bürger",
98 sourceTitles = getOKKSourceTitles(),
99 as.alternatives = FALSE,
100 vc = 'referTo ratskorpus-2023-1 & corpusTitle="',
101 suffixes = c('Innen', '[\\*]innen"', '[_]innen"', ' innen'),
102 prefixes = c('', '"', '"', ''),
103 kco = new("KorAPConnection", verbose=TRUE) %>% oauthorizeDemo()) {
104 df <-
105 frequencyQuery(kco, paste0(prefixes, word, suffixes), paste0(vc, sourceTitles, '"'), as.alternatives=as.alternatives) %>%
106 unravelPunctuationGenderCases(kco = kco) %>%
107 mutate(Quelle=str_replace_all(.$vc, '(.*="|"$)', '')) %>%
108 ipm() %>%
109 filter(total > 0) %>%
110 rename(Variante=query)
111
112 dispersions <- df %>%
113 group_by(Variante) %>%
114 mutate(total_size=sum(total), rel_size=total/total_size) %>%
115 group_modify(~ as_tibble(dispersions2(.x$totalResults, .x$rel_size))) %>%
116 pivot_longer(cols= -1) %>%
117 filter(! str_detect(name, " equally")) %>%
118 mutate(name=str_replace(name, " ?\\(.*\\)", '')) %>%
119 mutate_when(str_detect(name, "DPnorm"), name = "1-DP_norm", value = 1 -value) %>%
120 mutate(across(where(is.double)), value = round(value, 2)) %>%
121 mutate_when(str_detect(name, "(corpus|range)"), value = as.integer(value)) %>%
122 group_by(name) %>%
123 mutate(rank = if_else(str_detect(name, "Kullback"), rank(value), rank(-value))) %>%
124 group_by(Variante) %>%
125 # mutate(rank = mean(rank, na.rm = FALSE)) %>%
126 bind_rows(summarise(., name="Avg. Rank", value = mean(rank, na.rm = FALSE))) %>%
127 select(-rank) %>%
128 pivot_wider(names_from = Variante) %>%
129 rename(measure=name)
130
131 return(list(df, dispersions))
132}
133
134df_dispersions <- sourceDispersions("Nutzer")
135View(df_dispersions[[2]])
136
Marc Kupietz6dd348c2023-05-03 08:26:58 +0200137# htmlwidgets::saveWidget(hc, file=fname, selfcontained = TRUE)
138
Marc Kupietzadc86eb2023-05-10 19:49:15 +0200139# Diewald, Nils/Kupietz, Marc/L\u00FCngen, Harald (2022):
Marc Kupietz6dd348c2023-05-03 08:26:58 +0200140# Tokenizing on scale. Preprocessing large text corpora on the lexical and sentence level.
Marc Kupietzadc86eb2023-05-10 19:49:15 +0200141# In: Klosa-K\u00FCckelhaus, Annette/Engelberg, Stefan/M\u00F6hrs, Christine/Storjohann, Petra (eds):
Marc Kupietz6dd348c2023-05-03 08:26:58 +0200142# Dictionaries and Society. Proceedings of the XX EURALEX International Congress, 12-16 July 2022.
143# Mannheim: IDS-Verlag, 2022: 208-221.
144# <https://doi.org/10.14618/ids-pub-11146>
145