Add dispersions to gender variants demo
Change-Id: I76f36e254a5417c959e2c2645c14a43c47f5b492
diff --git a/demo/pluralGenderVariants.R b/demo/pluralGenderVariants.R
index 8ff8c3b..5241987 100644
--- a/demo/pluralGenderVariants.R
+++ b/demo/pluralGenderVariants.R
@@ -3,8 +3,11 @@
library(purrrlyr)
library(httr2)
library(httpuv)
+library(RMariaDB)
+library(tidyfst)
demo_kor_app_id = "773NHGM76N7P9b6rLfmpM4"
+source("https://www.stgries.info/research/dispersion/dispersions.r")
# The challenge in searching gender variants with KorAP and DeReKo is that,
# firstly, some characters used for gender marking, especially punctuation marks,
@@ -80,6 +83,57 @@
hc <- plotPluralGenderVariants("Nutzer", c(1995:2022), as.alternatives = FALSE)
+
+# Fetch the titles selected from the `basename` table of the `corpora` database.
+getOKKSourceTitles <- function() {
+  db <- dbConnect(MariaDB(), host="klinux10.ids-mannheim.de", user="viewer", dbname="corpora")
+  on.exit(dbDisconnect(db), add = TRUE)  # close the connection even on error
+  dbExecute(db, "SET NAMES 'utf8'")
+  rs <- dbSendQuery(db, "SELECT title from basename WHERE basename.rsr")
+  on.exit(dbClearResult(rs), add = TRUE, after = FALSE)  # clear before disconnect
+  return(dbFetch(rs)$title)
+}
+
+# Run the gender-variant queries for `word` against the virtual corpora
+# vc + title + '"' (one per entry of `sourceTitles`) and compute per-variant
+# dispersion measures with dispersions2(). Returns list(df = frequency table,
+# dispersions = measure x variant table); [[1]] and [[2]] keep working.
+sourceDispersions <- function(word = "Bürger",
+                              sourceTitles = getOKKSourceTitles(),
+                              as.alternatives = FALSE,
+                              vc = 'referTo ratskorpus-2023-1 & corpusTitle="',
+                              suffixes = c('Innen', '[\\*]innen"', '[_]innen"', ' innen'),
+                              prefixes = c('', '"', '"', ''),
+                              kco = new("KorAPConnection", verbose=TRUE) %>% oauthorizeDemo()) {
+  df <- frequencyQuery(kco, paste0(prefixes, word, suffixes), paste0(vc, sourceTitles, '"'), as.alternatives=as.alternatives) %>%
+    unravelPunctuationGenderCases(kco = kco) %>%
+    mutate(Quelle=str_replace_all(.$vc, '(.*="|"$)', '')) %>%  # strip vc prefix and closing quote
+    ipm() %>%
+    filter(total > 0) %>%
+    rename(Variante=query)
+  dispersions <- df %>%
+    group_by(Variante) %>%
+    mutate(total_size=sum(total), rel_size=total/total_size) %>%
+    group_modify(~ as_tibble(dispersions2(.x$totalResults, .x$rel_size))) %>%
+    pivot_longer(cols= -1) %>%
+    filter(! str_detect(name, " equally")) %>%
+    mutate(name=str_replace(name, " ?\\(.*\\)", '')) %>%
+    mutate_when(str_detect(name, "DPnorm"), name = "1-DP_norm", value = 1 -value) %>%
+    mutate(value = round(value, 2)) %>%  # round the measure values to two decimals
+    mutate_when(str_detect(name, "(corpus|range)"), value = as.integer(value)) %>%
+    group_by(name) %>%
+    mutate(rank = if_else(str_detect(name, "Kullback"), rank(value), rank(-value))) %>%  # only Kullback ranks ascending
+    group_by(Variante) %>%
+    bind_rows(summarise(., name="Avg. Rank", value = mean(rank, na.rm = FALSE))) %>%
+    select(-rank) %>%
+    pivot_wider(names_from = Variante) %>%
+    rename(measure=name)
+  return(list(df = df, dispersions = dispersions))
+}
+# Demo: show the dispersion table (second list element) for "Nutzer".
+df_dispersions <- sourceDispersions(word = "Nutzer")
+View(df_dispersions[[2L]])
+
# htmlwidgets::saveWidget(hc, file=fname, selfcontained = TRUE)
# Diewald, Nils/Kupietz, Marc/L\u00FCngen, Harald (2022):