Marc Kupietz | 6dd348c | 2023-05-03 08:26:58 +0200 | [diff] [blame] | 1 | library(RKorAPClient) |
| 2 | library(tidyverse) |
| 3 | library(purrrlyr) |
Marc Kupietz | 65eede5 | 2023-05-12 17:30:07 +0200 | [diff] [blame] | 4 | library(httr2) |
| 5 | library(httpuv) |
Marc Kupietz | 1ce833d | 2024-01-24 14:14:21 +0100 | [diff] [blame^] | 6 | library(RMariaDB) |
| 7 | library(tidyfst) |
Marc Kupietz | 65eede5 | 2023-05-12 17:30:07 +0200 | [diff] [blame] | 8 | |
| 9 | demo_kor_app_id = "773NHGM76N7P9b6rLfmpM4" |
Marc Kupietz | 1ce833d | 2024-01-24 14:14:21 +0100 | [diff] [blame^] | 10 | source("https://www.stgries.info/research/dispersion/dispersions.r") |
Marc Kupietz | 6dd348c | 2023-05-03 08:26:58 +0200 | [diff] [blame] | 11 | |
| 12 | # The challenge in searching gender variants with KorAP and DeReKo is that, |
| 13 | # firstly, some characters used for gender marking, especially punctuation marks, |
| 14 | # are interpreted and indexed as token boundaries and, secondly, punctuation |
| 15 | # marks are currently not indexed in KorAP. |
| 16 | # |
| 17 | # The former is intentional with regard to a majority of use cases and with |
Marc Kupietz | adc86eb | 2023-05-10 19:49:15 +0200 | [diff] [blame] | 18 | # regard to the reproducibility maxim (see Diewald/Kupietz/L\u00FCngen 2022). |
Marc Kupietz | 6dd348c | 2023-05-03 08:26:58 +0200 | [diff] [blame] | 19 | # The latter is a shortcoming in KorAP that will be remedied sooner or later |
| 20 | # and that can be solved provisionally in the meantime with the help of the KorAP API. |
| 21 | # |
| 22 | # The following unravelPunctuationGenderCases function, for example, takes the |
| 23 | # result of a frequencyQuery for two supposedly consecutive tokens and then looks more |
| 24 | # closely into the KWIC snippets to see which non-indexed strings actually do appear |
| 25 | # between these tokens and counts the frequencies of the variants that occur. |
| 26 | |
Marc Kupietz | 65eede5 | 2023-05-12 17:30:07 +0200 | [diff] [blame] | 27 | unravelPunctuationGenderCases <- function(df, suffix = "innen", kco) { |
Marc Kupietz | 6dd348c | 2023-05-03 08:26:58 +0200 | [diff] [blame] | 28 | if ( nrow(df) > 1) { |
| 29 | df %>% |
| 30 | dplyr::filter(totalResults > 0 & str_detect(query, paste0(" ", suffix))) %>% |
Marc Kupietz | 65eede5 | 2023-05-12 17:30:07 +0200 | [diff] [blame] | 31 | by_row(unravelPunctuationGenderCases, kco = kco, .collate = "rows", .labels=FALSE) %>% |
Marc Kupietz | 6dd348c | 2023-05-03 08:26:58 +0200 | [diff] [blame] | 32 | select(-.row) %>% |
| 33 | bind_rows(df %>% dplyr::filter(totalResults == 0 | ! str_detect(query, paste0(" ", suffix)))) %>% |
| 34 | tidyr::complete(query, nesting(vc, total), fill = list(totalResults = 0)) %>% |
| 35 | select(-f, -conf.low, -conf.high) %>% |
| 36 | RKorAPClient::ci() %>% |
| 37 | mutate(query = str_replace_all(query, '(^"|"$|[\\[\\]\\\\])', '')) %>% |
| 38 | mutate(query = str_replace_all(query, paste0('\\(', suffix), paste0('(', suffix, ')'))) %>% |
| 39 | filter(!str_detect(query, paste0("\\w ", suffix))) # remove "Nutzer innen" |
| 40 | } else { |
| 41 | q <- corpusQuery(kco, df$query, vc=df$vc, metadataOnly = FALSE) %>% |
| 42 | fetchAll() |
| 43 | cases <- q@collectedMatches$snippet %>% |
| 44 | str_replace_all(paste0(".*<mark>.*\\w(\\W+)", suffix, "</mark>.*"), "\\1") %>% |
| 45 | as_tibble() %>% |
| 46 | group_by(value) %>% |
| 47 | summarise(n = n()) |
| 48 | df %>% uncount(nrow(cases)) %>% |
| 49 | mutate(query = str_replace(query, paste0(" (?=", suffix, ")"), cases$value), totalResults = cases$n) |
| 50 | } |
| 51 | } |
| 52 | |
Marc Kupietz | 65eede5 | 2023-05-12 17:30:07 +0200 | [diff] [blame] | 53 | oauthorizeDemo <- function(kco, app_id = demo_kor_app_id) { |
| 54 | if (is.null(kco@accessToken) || is.null(kco@welcome)) { # if access token is not set or invalid |
| 55 | kco@accessToken <- ( # request one |
| 56 | oauth_client( |
| 57 | id = app_id, # for the demo application |
| 58 | token_url = paste0(kco@apiUrl, "oauth2/token") |
| 59 | ) %>% |
| 60 | oauth_flow_auth_code( |
| 61 | scope = "search match_info", |
| 62 | auth_url = paste0(kco@KorAPUrl, "settings/oauth/authorize") |
| 63 | ) |
| 64 | )$access_token |
| 65 | } |
| 66 | kco |
| 67 | } |
| 68 | |
Marc Kupietz | 6dd348c | 2023-05-03 08:26:58 +0200 | [diff] [blame] | 69 | plotPluralGenderVariants <- function(word = "Nutzer", |
| 70 | years = c(1995:2022), |
| 71 | as.alternatives = FALSE, |
| 72 | vc = "referTo ratskorpus-2023-1 & pubDate in", |
| 73 | suffixes = c('Innen', '[\\*]innen"', '[_]innen"', ' innen'), |
| 74 | prefixes = c('', '"', '"', ''), |
Marc Kupietz | 65eede5 | 2023-05-12 17:30:07 +0200 | [diff] [blame] | 75 | kco = new("KorAPConnection", verbose=TRUE) %>% oauthorizeDemo()) { |
Marc Kupietz | 6dd348c | 2023-05-03 08:26:58 +0200 | [diff] [blame] | 76 | hc <- |
| 77 | frequencyQuery(kco, paste0(prefixes, word, suffixes), paste(vc, years), as.alternatives=as.alternatives) %>% |
| 78 | unravelPunctuationGenderCases(kco = kco) %>% |
| 79 | hc_freq_by_year_ci(as.alternatives) |
| 80 | print(hc) |
| 81 | hc |
| 82 | } |
| 83 | |
| 84 | |
| 85 | hc <- plotPluralGenderVariants("Nutzer", c(1995:2022), as.alternatives = FALSE) |
Marc Kupietz | 1ce833d | 2024-01-24 14:14:21 +0100 | [diff] [blame^] | 86 | |
| 87 | getOKKSourceTitles <- function() { |
| 88 | db <- dbConnect(MariaDB(), host="klinux10.ids-mannheim.de", user="viewer", dbname="corpora") |
| 89 | dbExecute(db, "SET NAMES 'utf8'") |
| 90 | rs <- dbSendQuery(db, "SELECT title from basename WHERE basename.rsr") |
| 91 | corpus_parts <- dbFetch(rs) |
| 92 | dbClearResult(rs) |
| 93 | dbDisconnect(db) |
| 94 | return(corpus_parts$title) |
| 95 | } |
| 96 | |
| 97 | sourceDispersions <- function(word = "Bürger", |
| 98 | sourceTitles = getOKKSourceTitles(), |
| 99 | as.alternatives = FALSE, |
| 100 | vc = 'referTo ratskorpus-2023-1 & corpusTitle="', |
| 101 | suffixes = c('Innen', '[\\*]innen"', '[_]innen"', ' innen'), |
| 102 | prefixes = c('', '"', '"', ''), |
| 103 | kco = new("KorAPConnection", verbose=TRUE) %>% oauthorizeDemo()) { |
| 104 | df <- |
| 105 | frequencyQuery(kco, paste0(prefixes, word, suffixes), paste0(vc, sourceTitles, '"'), as.alternatives=as.alternatives) %>% |
| 106 | unravelPunctuationGenderCases(kco = kco) %>% |
| 107 | mutate(Quelle=str_replace_all(.$vc, '(.*="|"$)', '')) %>% |
| 108 | ipm() %>% |
| 109 | filter(total > 0) %>% |
| 110 | rename(Variante=query) |
| 111 | |
| 112 | dispersions <- df %>% |
| 113 | group_by(Variante) %>% |
| 114 | mutate(total_size=sum(total), rel_size=total/total_size) %>% |
| 115 | group_modify(~ as_tibble(dispersions2(.x$totalResults, .x$rel_size))) %>% |
| 116 | pivot_longer(cols= -1) %>% |
| 117 | filter(! str_detect(name, " equally")) %>% |
| 118 | mutate(name=str_replace(name, " ?\\(.*\\)", '')) %>% |
| 119 | mutate_when(str_detect(name, "DPnorm"), name = "1-DP_norm", value = 1 -value) %>% |
| 120 | mutate(across(where(is.double)), value = round(value, 2)) %>% |
| 121 | mutate_when(str_detect(name, "(corpus|range)"), value = as.integer(value)) %>% |
| 122 | group_by(name) %>% |
| 123 | mutate(rank = if_else(str_detect(name, "Kullback"), rank(value), rank(-value))) %>% |
| 124 | group_by(Variante) %>% |
| 125 | # mutate(rank = mean(rank, na.rm = FALSE)) %>% |
| 126 | bind_rows(summarise(., name="Avg. Rank", value = mean(rank, na.rm = FALSE))) %>% |
| 127 | select(-rank) %>% |
| 128 | pivot_wider(names_from = Variante) %>% |
| 129 | rename(measure=name) |
| 130 | |
| 131 | return(list(df, dispersions)) |
| 132 | } |
| 133 | |
| 134 | df_dispersions <- sourceDispersions("Nutzer") |
| 135 | View(df_dispersions[[2]]) |
| 136 | |
Marc Kupietz | 6dd348c | 2023-05-03 08:26:58 +0200 | [diff] [blame] | 137 | # htmlwidgets::saveWidget(hc, file=fname, selfcontained = TRUE) |
| 138 | |
Marc Kupietz | adc86eb | 2023-05-10 19:49:15 +0200 | [diff] [blame] | 139 | # Diewald, Nils/Kupietz, Marc/L\u00FCngen, Harald (2022): |
Marc Kupietz | 6dd348c | 2023-05-03 08:26:58 +0200 | [diff] [blame] | 140 | # Tokenizing on scale. Preprocessing large text corpora on the lexical and sentence level. |
Marc Kupietz | adc86eb | 2023-05-10 19:49:15 +0200 | [diff] [blame] | 141 | # In: Klosa-K\u00FCckelhaus, Annette/Engelberg, Stefan/M\u00F6hrs, Christine/Storjohann, Petra (eds): |
Marc Kupietz | 6dd348c | 2023-05-03 08:26:58 +0200 | [diff] [blame] | 142 | # Dictionaries and Society. Proceedings of the XX EURALEX International Congress, 12-16 July 2022. |
| 143 | # Mannheim: IDS-Verlag, 2022: 208-221. |
| 144 | # <https://doi.org/10.14618/ids-pub-11146> |
| 145 | |