blob: 5241987703ac646c97e3486b5b4a6e00dd3e4b2f [file] [log] [blame]
library(RKorAPClient)
library(tidyverse)
library(purrrlyr)
library(httr2)
library(httpuv)
library(RMariaDB)
library(tidyfst)
demo_kor_app_id = "773NHGM76N7P9b6rLfmpM4"
source("https://www.stgries.info/research/dispersion/dispersions.r")
# The challenge in searching gender variants with KorAP and DeReKo is that,
# firstly, some characters used for gender marking, especially punctuation marks,
# are interpreted and indexed as token boundaries and, secondly, punctuation
# marks are currently not indexed in KorAP.
#
# The former is intentional with regard to a majority of use cases and with
# regard to the reproducibility maxim (see Diewald/Kupietz/L\u00FCngen 2022).
# The latter is a shortcoming in KorAP that will be remedied sooner or later
# and that can be solved provisionally in the meantime with the help of the KorAP API.
#
# The following unravelPunctuationGenderCases function, for example, takes the
# result of a frequencyQuery for two supposedly consecutive tokens and then looks more
# closely into the KWIC snippets to see which non-indexed strings actually do appear
# between these tokens and counts the frequencies of the variants that occur.
unravelPunctuationGenderCases <- function(df, suffix = "innen", kco) {
if ( nrow(df) > 1) {
df %>%
dplyr::filter(totalResults > 0 & str_detect(query, paste0(" ", suffix))) %>%
by_row(unravelPunctuationGenderCases, kco = kco, .collate = "rows", .labels=FALSE) %>%
select(-.row) %>%
bind_rows(df %>% dplyr::filter(totalResults == 0 | ! str_detect(query, paste0(" ", suffix)))) %>%
tidyr::complete(query, nesting(vc, total), fill = list(totalResults = 0)) %>%
select(-f, -conf.low, -conf.high) %>%
RKorAPClient::ci() %>%
mutate(query = str_replace_all(query, '(^"|"$|[\\[\\]\\\\])', '')) %>%
mutate(query = str_replace_all(query, paste0('\\(', suffix), paste0('(', suffix, ')'))) %>%
filter(!str_detect(query, paste0("\\w ", suffix))) # remove "Nutzer innen"
} else {
q <- corpusQuery(kco, df$query, vc=df$vc, metadataOnly = FALSE) %>%
fetchAll()
cases <- q@collectedMatches$snippet %>%
str_replace_all(paste0(".*<mark>.*\\w(\\W+)", suffix, "</mark>.*"), "\\1") %>%
as_tibble() %>%
group_by(value) %>%
summarise(n = n())
df %>% uncount(nrow(cases)) %>%
mutate(query = str_replace(query, paste0(" (?=", suffix, ")"), cases$value), totalResults = cases$n)
}
}
oauthorizeDemo <- function(kco, app_id = demo_kor_app_id) {
if (is.null(kco@accessToken) || is.null(kco@welcome)) { # if access token is not set or invalid
kco@accessToken <- ( # request one
oauth_client(
id = app_id, # for the demo application
token_url = paste0(kco@apiUrl, "oauth2/token")
) %>%
oauth_flow_auth_code(
scope = "search match_info",
auth_url = paste0(kco@KorAPUrl, "settings/oauth/authorize")
)
)$access_token
}
kco
}
plotPluralGenderVariants <- function(word = "Nutzer",
years = c(1995:2022),
as.alternatives = FALSE,
vc = "referTo ratskorpus-2023-1 & pubDate in",
suffixes = c('Innen', '[\\*]innen"', '[_]innen"', ' innen'),
prefixes = c('', '"', '"', ''),
kco = new("KorAPConnection", verbose=TRUE) %>% oauthorizeDemo()) {
hc <-
frequencyQuery(kco, paste0(prefixes, word, suffixes), paste(vc, years), as.alternatives=as.alternatives) %>%
unravelPunctuationGenderCases(kco = kco) %>%
hc_freq_by_year_ci(as.alternatives)
print(hc)
hc
}
hc <- plotPluralGenderVariants("Nutzer", c(1995:2022), as.alternatives = FALSE)
getOKKSourceTitles <- function() {
db <- dbConnect(MariaDB(), host="klinux10.ids-mannheim.de", user="viewer", dbname="corpora")
dbExecute(db, "SET NAMES 'utf8'")
rs <- dbSendQuery(db, "SELECT title from basename WHERE basename.rsr")
corpus_parts <- dbFetch(rs)
dbClearResult(rs)
dbDisconnect(db)
return(corpus_parts$title)
}
sourceDispersions <- function(word = "Bürger",
sourceTitles = getOKKSourceTitles(),
as.alternatives = FALSE,
vc = 'referTo ratskorpus-2023-1 & corpusTitle="',
suffixes = c('Innen', '[\\*]innen"', '[_]innen"', ' innen'),
prefixes = c('', '"', '"', ''),
kco = new("KorAPConnection", verbose=TRUE) %>% oauthorizeDemo()) {
df <-
frequencyQuery(kco, paste0(prefixes, word, suffixes), paste0(vc, sourceTitles, '"'), as.alternatives=as.alternatives) %>%
unravelPunctuationGenderCases(kco = kco) %>%
mutate(Quelle=str_replace_all(.$vc, '(.*="|"$)', '')) %>%
ipm() %>%
filter(total > 0) %>%
rename(Variante=query)
dispersions <- df %>%
group_by(Variante) %>%
mutate(total_size=sum(total), rel_size=total/total_size) %>%
group_modify(~ as_tibble(dispersions2(.x$totalResults, .x$rel_size))) %>%
pivot_longer(cols= -1) %>%
filter(! str_detect(name, " equally")) %>%
mutate(name=str_replace(name, " ?\\(.*\\)", '')) %>%
mutate_when(str_detect(name, "DPnorm"), name = "1-DP_norm", value = 1 -value) %>%
mutate(across(where(is.double)), value = round(value, 2)) %>%
mutate_when(str_detect(name, "(corpus|range)"), value = as.integer(value)) %>%
group_by(name) %>%
mutate(rank = if_else(str_detect(name, "Kullback"), rank(value), rank(-value))) %>%
group_by(Variante) %>%
# mutate(rank = mean(rank, na.rm = FALSE)) %>%
bind_rows(summarise(., name="Avg. Rank", value = mean(rank, na.rm = FALSE))) %>%
select(-rank) %>%
pivot_wider(names_from = Variante) %>%
rename(measure=name)
return(list(df, dispersions))
}
df_dispersions <- sourceDispersions("Nutzer")
View(df_dispersions[[2]])
# htmlwidgets::saveWidget(hc, file=fname, selfcontained = TRUE)
# Diewald, Nils/Kupietz, Marc/L\u00FCngen, Harald (2022):
# Tokenizing on scale. Preprocessing large text corpora on the lexical and sentence level.
# In: Klosa-K\u00FCckelhaus, Annette/Engelberg, Stefan/M\u00F6hrs, Christine/Storjohann, Petra (eds):
# Dictionaries and Society. Proceedings of the XX EURALEX International Congress, 12-16 July 2022.
# Mannheim: IDS-Verlag, 2022: 208-221.
# <https://doi.org/10.14618/ids-pub-11146>