library(RKorAPClient)
library(tidyverse)
library(urltools)
library(tidyllm)

VC <- "author=/Dickens.*/"
deliko <- KorAPConnection("https://korap.dnb.de", verbose = TRUE)
query <- corpusQuery(
  deliko,
  "<base/s=t>",
  # this finds each text once
  vc = VC,
  fields = c(
    "textSigle",
    "title",
    "subTitle",
    "author",
    "translator",
    "textType",
    "textTypeRef",
    "publisher",
    "pubDate",
    "pubPlace",
    "ISBN",
    "URN"
  )
) %>%
  fetchNext(maxFetch = 2, randomizePageOrder = TRUE)
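
# Optional sanity check (assumes the standard KorAPQuery slots): compare how many
# texts matched the virtual corpus with how many were actually fetched above.
# query@totalResults
# nrow(query@collectedMatches)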

df <- query@collectedMatches

df <- df %>%
  mutate(
    publisher = str_replace_all(publisher, "data:,", ""),
    URN = URLdecode(str_replace_all(URN, ".*,", ""))
  ) %>%
  select(-c("matchStart", "matchEnd"))

# write_tsv(df, "deliko_metadata.tsv")

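# Optional: quickly inspect the cleaned metadata before sending anything to an LLM.
# dplyr::glimpse(df)
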
# Build a compact prompt requesting strict JSON for translation/original info
build_translation_prompt <- function(
  title,
  author = NA_character_,
  subTitle = NA_character_,
  translator = NA_character_,
  publisher = NA_character_,
  pubDate = NA_character_,
  ISBN = NA_character_) {
  fields <- c(
    sprintf("Titel: %s", dplyr::coalesce(title, "")),
    sprintf("Untertitel: %s", dplyr::coalesce(subTitle, "")),
    sprintf("Autor/in: %s", dplyr::coalesce(author, "")),
    sprintf("Übersetzer/in: %s", dplyr::coalesce(translator, "")),
    sprintf("Verlag: %s", dplyr::coalesce(publisher, "")),
    sprintf("Erscheinungsjahr (de): %s", dplyr::coalesce(pubDate, "")),
    sprintf("ISBN: %s", dplyr::coalesce(ISBN, ""))
  )
  context <- paste(fields, collapse = "\n")
  paste0(
    "Du bist ein bibliographischer Assistent. Bestimme für das folgende deutschsprachige Buch:",
    "\n1) Ob es eine Übersetzung ist.",
    "\n2) Falls ja: den Originaltitel und das Jahr der Erstveröffentlichung.",
    "\n\nGib ausschließlich ein einzelnes JSON-Objekt zurück – ohne Begleittext – mit exakt diesen Schlüsseln:",
    "\n{\n \"is_translation\": <boolean>,\n \"original_title\": <string|null>,\n \"original_publication_year\": <integer|null>,\n \"confidence\": <number zwischen 0 und 1>\n}",
    "\nNutze null für Unbekanntes. Antworte nur mit JSON.",
    "\n\nMetadaten:\n",
    context
  )
}
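
# Optional preview of the generated prompt, using the first fetched record
# (purely illustrative; no API call is made):
# cat(build_translation_prompt(
#   title = df$title[1],
#   author = df$author[1],
#   subTitle = df$subTitle[1],
#   translator = df$translator[1],
#   publisher = df$publisher[1],
#   pubDate = df$pubDate[1],
#   ISBN = df$ISBN[1]
# ))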

# Resolve tidyllm provider from model prefix
resolve_provider_for_model <- function(model) {
  if (grepl("^gpt-", model, ignore.case = TRUE)) {
    return(tidyllm::openai())
  } else if (grepl("^claude-", model, ignore.case = TRUE)) {
    return(tidyllm::claude())
  } else if (grepl("^gemini-", model, ignore.case = TRUE)) {
    return(tidyllm::gemini())
  } else if (grepl("^deepseek", model, ignore.case = TRUE)) {
    return(tidyllm::deepseek())
  }
  # Default fallback
  tidyllm::deepseek()
}
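
# Dispatch is purely prefix-based and case-insensitive, e.g.:
# resolve_provider_for_model("gpt-4o-mini")   # -> tidyllm::openai()
# resolve_provider_for_model("deepseek-chat") # -> tidyllm::deepseek()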

# Friendly API key checks per provider family
ensure_api_key_for_model <- function(model) {
  if (grepl("^gpt-", model, ignore.case = TRUE)) {
    if (!nzchar(Sys.getenv("OPENAI_API_KEY"))) {
      stop("OPENAI_API_KEY is not set. Set it via Sys.setenv('OPENAI_API_KEY'='...') or source your shell env.")
    }
  } else if (grepl("^claude-", model, ignore.case = TRUE)) {
    if (!nzchar(Sys.getenv("ANTHROPIC_API_KEY"))) {
      stop("ANTHROPIC_API_KEY is not set. Set it via Sys.setenv('ANTHROPIC_API_KEY'='...') or source your shell env.")
    }
  } else if (grepl("^gemini-", model, ignore.case = TRUE)) {
    if (!nzchar(Sys.getenv("GOOGLE_API_KEY"))) {
      stop("GOOGLE_API_KEY is not set. Set it via Sys.setenv('GOOGLE_API_KEY'='...') or source your shell env.")
    }
  } else if (grepl("^deepseek", model, ignore.case = TRUE)) {
    if (!nzchar(Sys.getenv("DEEPSEEK_API_KEY"))) {
      stop("DEEPSEEK_API_KEY is not set. Set it via Sys.setenv('DEEPSEEK_API_KEY'='...') or source your shell env.")
    }
  }
  invisible(TRUE)
}
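
# Set the key for the provider you intend to use before running the augmentation,
# e.g. (placeholder value):
# Sys.setenv(DEEPSEEK_API_KEY = "...")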

# Infer original title and year for a single record via LLM (DeepSeek by default)
infer_original_single <- function(
  title,
  author = NA_character_,
  subTitle = NA_character_,
  translator = NA_character_,
  publisher = NA_character_,
  pubDate = NA_character_,
  ISBN = NA_character_,
  provider = NULL,
  model = "deepseek-chat",
  temperature = 0.1,
  max_tries = 3,
  timeout = 60) {
  prompt <- build_translation_prompt(
    title = title,
    author = author,
    subTitle = subTitle,
    translator = translator,
    publisher = publisher,
    pubDate = pubDate,
    ISBN = ISBN
  )

  # Resolve provider if not supplied
  if (is.null(provider)) {
    provider <- resolve_provider_for_model(model)
  }

  result <- try(
    {
      tidyllm::llm_message(prompt) |>
        tidyllm::chat(
          .provider = provider,
          .model = model,
          .temperature = temperature,
          .timeout = timeout,
          .max_tries = max_tries
        )
    },
    silent = TRUE
  )

  if (inherits(result, "try-error")) {
    return(tibble::tibble(
      original_title = NA_character_,
      original_publication_year = NA_integer_,
      orig_confidence = NA_real_
    ))
  }

  # Prefer JSON parsing via the tidyllm helper; fall back to NA on failure
  parsed <- NULL
  suppressWarnings({
    parsed <- tidyllm::get_reply_data(result)
  })

  if (is.null(parsed)) {
    # Try to extract JSON manually as a very last resort
    raw <- tidyllm::get_reply(result)
    json_txt <- stringr::str_extract(raw, "\\{[\\s\\S]*\\}")
    if (!is.na(json_txt)) {
      parsed <- try(jsonlite::fromJSON(json_txt), silent = TRUE)
      if (inherits(parsed, "try-error")) parsed <- NULL
    }
  }

  if (is.null(parsed)) {
    return(tibble::tibble(
      original_title = NA_character_,
      original_publication_year = NA_integer_,
      orig_confidence = NA_real_
    ))
  }

  ot <- parsed[["original_title"]]
  oy <- suppressWarnings(as.integer(parsed[["original_publication_year"]]))
  cf <- suppressWarnings(as.numeric(parsed[["confidence"]]))

  # Guard against missing keys: parsed[["..."]] can be NULL or length zero,
  # which would break the is.na() checks and yield zero-row tibbles
  ot <- if (is.null(ot) || length(ot) == 0) NA_character_ else as.character(ot[[1]])
  oy <- if (length(oy) == 0) NA_integer_ else oy[[1]]
  cf <- if (length(cf) == 0) NA_real_ else cf[[1]]

  # Clamp confidence to [0, 1]
  if (!is.na(cf)) cf <- max(0, min(1, cf))

  tibble::tibble(
    original_title = ot,
    original_publication_year = oy,
    orig_confidence = cf
  )
}
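
# Single-record smoke test (makes one LLM API call; illustrative only):
# infer_original_single(
#   title = df$title[1],
#   author = df$author[1],
#   translator = df$translator[1],
#   pubDate = df$pubDate[1]
# )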

# Public API: augment an existing metadata df with original title/year (+ optional confidence)
augment_metadata_with_original <- function(
  df,
  provider = NULL,
  model = "deepseek-chat",
  include_confidence = TRUE,
  temperature = 0.1,
  max_tries = 3,
  timeout = 60,
  pause_seconds = 0,
  check_api_key = TRUE,
  show_progress = TRUE,
  progress_style = 3) {
  stopifnot(is.data.frame(df))

  needed <- c("title", "author", "subTitle", "translator", "publisher", "pubDate", "ISBN")
  missing <- setdiff(needed, names(df))
  if (length(missing) > 0) {
    stop(sprintf("Missing required columns: %s", paste(missing, collapse = ", ")))
  }

  # Friendly pre-check for required API key depending on the chosen model/provider
  if (isTRUE(check_api_key)) {
    ensure_api_key_for_model(model)
  }

  # Resolve provider if not supplied
  if (is.null(provider)) {
    provider <- resolve_provider_for_model(model)
  }

  # Prepare the per-row inputs; rows are processed one at a time in the explicit
  # loop below (rather than purrr::pmap_dfr), which keeps the progress bar and
  # optional pauses between API calls simple and avoids NSE warnings
  input_cols <- tibble::tibble(
    title = df[["title"]],
    author = df[["author"]],
    subTitle = df[["subTitle"]],
    translator = df[["translator"]],
    publisher = df[["publisher"]],
    pubDate = df[["pubDate"]],
    ISBN = df[["ISBN"]]
  )

  infer_wrapper <- function(title, author, subTitle, translator, publisher, pubDate, ISBN) {
    res <- infer_original_single(
      title = title,
      author = author,
      subTitle = subTitle,
      translator = translator,
      publisher = publisher,
      pubDate = pubDate,
      ISBN = ISBN,
      provider = provider,
      model = model,
      temperature = temperature,
      max_tries = max_tries,
      timeout = timeout
    )
    if (pause_seconds > 0) Sys.sleep(pause_seconds)
    res
  }

  n <- nrow(input_cols)
  results_list <- vector("list", n)
  empty_res <- tibble::tibble(
    original_title = NA_character_,
    original_publication_year = NA_integer_,
    orig_confidence = NA_real_
  )
  pb <- NULL
  if (isTRUE(show_progress) && n > 0) {
    pb <- utils::txtProgressBar(min = 0, max = n, style = progress_style)
  }
  for (i in seq_len(n)) {
    res <- tryCatch(
      infer_wrapper(
        title = input_cols$title[i],
        author = input_cols$author[i],
        subTitle = input_cols$subTitle[i],
        translator = input_cols$translator[i],
        publisher = input_cols$publisher[i],
        pubDate = input_cols$pubDate[i],
        ISBN = input_cols$ISBN[i]
      ),
      error = function(e) empty_res
    )
    if (is.null(res) || !is.data.frame(res) || nrow(res) < 1) {
      res <- empty_res
    } else if (nrow(res) > 1) {
      res <- res[1, , drop = FALSE]
    }
    results_list[[i]] <- res
    if (!is.null(pb)) utils::setTxtProgressBar(pb, i)
  }
  if (!is.null(pb)) close(pb)
  results_tbl <- dplyr::bind_rows(results_list)
  if (nrow(results_tbl) != n) {
    # Ensure row count aligns with df to avoid recycling errors
    if (nrow(results_tbl) < n) {
      pad <- dplyr::bind_rows(rep(list(empty_res), n - nrow(results_tbl)))
      results_tbl <- dplyr::bind_rows(results_tbl, pad)
    } else if (nrow(results_tbl) > n) {
      results_tbl <- dplyr::slice(results_tbl, seq_len(n))
    }
  }

  out <- dplyr::bind_cols(df, results_tbl)
  if (!include_confidence) out <- dplyr::select(out, -"orig_confidence")
  out
}

# DeepSeek (default)
df_aug <- augment_metadata_with_original(df, model = "deepseek-chat")

# OpenAI
# df_aug <- augment_metadata_with_original(df, model = "gpt-4o-mini")

# Claude
# df_aug <- augment_metadata_with_original(df, model = "claude-3-5-sonnet-latest")

# Gemini
# df_aug <- augment_metadata_with_original(df, model = "gemini-2.5-pro")
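
# Optionally persist the augmented table next to the raw metadata export above
# (the file name is just a suggestion):
# write_tsv(df_aug, "deliko_metadata_with_originals.tsv")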