| Marc Kupietz | 43bb93f | 2025-09-25 20:59:14 +0200 | [diff] [blame] | 1 | library(RKorAPClient) |
| 2 | library(tidyverse) |
| 3 | library(urltools) |
| 4 | library(tidyllm) |
| 5 | |
| 6 | VC <- "author=/Dickens.*/" |
| 7 | deliko <- KorAPConnection("https://korap.dnb.de", verbose = TRUE) |
| 8 | query <- corpusQuery( |
| 9 | deliko, |
| 10 | "<base/s=t>", |
| 11 | # this finds each text once |
| 12 | vc = VC, |
| 13 | fields = c( |
| 14 | "textSigle", |
| 15 | "title", |
| 16 | "subTitle", |
| 17 | "author", |
| 18 | "translator", |
| 19 | "textType", |
| 20 | "textTypeRef", |
| 21 | "publisher", |
| 22 | "pubDate", |
| 23 | "pubPlace", |
| 24 | "ISBN", |
| 25 | "URN" |
| 26 | ) |
| 27 | ) %>% |
| 28 | fetchNext(maxFetch = 2, randomizePageOrder = TRUE) |
| 29 | |
| 30 | df <- query@collectedMatches |
| 31 | |
| 32 | df <- df %>% |
| 33 | mutate( |
| 34 | publisher = str_replace_all(publisher, "data:,", ""), |
| 35 | URN = URLdecode(str_replace_all(URN, ".*,", "")) |
| 36 | ) %>% |
| 37 | select(-c("matchStart", "matchEnd")) |
| 38 | |
| 39 | # write_tsv(df, "deliko_metadata.tsv") |
| 40 | |
| 41 | # Build a compact prompt requesting strict JSON for translation/original info |
| 42 | build_translation_prompt <- function( |
| 43 | title, |
| 44 | author = NA_character_, |
| 45 | subTitle = NA_character_, |
| 46 | translator = NA_character_, |
| 47 | publisher = NA_character_, |
| 48 | pubDate = NA_character_, |
| Marc Kupietz | 1bfd830 | 2025-09-25 21:14:45 +0200 | [diff] [blame] | 49 | ISBN = NA_character_, |
| 50 | prompt_language = c("en", "de")) { |
| 51 | prompt_language <- match.arg(prompt_language) |
| 52 | if (prompt_language == "de") { |
| 53 | fields <- c( |
| 54 | sprintf("Titel: %s", dplyr::coalesce(title, "")), |
| 55 | sprintf("Untertitel: %s", dplyr::coalesce(subTitle, "")), |
| 56 | sprintf("Autor/in: %s", dplyr::coalesce(author, "")), |
| 57 | sprintf("Übersetzer/in: %s", dplyr::coalesce(translator, "")), |
| 58 | sprintf("Verlag: %s", dplyr::coalesce(publisher, "")), |
| 59 | sprintf("Erscheinungsjahr (de): %s", dplyr::coalesce(pubDate, "")), |
| 60 | sprintf("ISBN: %s", dplyr::coalesce(ISBN, "")) |
| 61 | ) |
| 62 | context <- paste(fields, collapse = "\n") |
| 63 | paste0( |
| 64 | "Du bist ein bibliographischer Assistent. Bestimme für das folgende deutschsprachige Buch:", |
| 65 | "\n1) Ob es eine Übersetzung ist.", |
| 66 | "\n2) Falls ja: den Originaltitel und das Jahr der Erstveröffentlichung.", |
| 67 | "\n\nGib ausschließlich ein einzelnes JSON-Objekt zurück – ohne Begleittext – mit exakt diesen Schlüsseln:", |
| 68 | "\n{\n \"is_translation\": <boolean>,\n \"original_title\": <string|null>,\n \"original_publication_year\": <integer|null>,\n \"confidence\": <number zwischen 0 und 1>\n}", |
| 69 | "\nNutze null für Unbekanntes. Antworte nur mit JSON.", |
| 70 | "\n\nMetadaten:\n", |
| 71 | context |
| 72 | ) |
| 73 | } else { |
| 74 | fields <- c( |
| 75 | sprintf("Title: %s", dplyr::coalesce(title, "")), |
| 76 | sprintf("Subtitle: %s", dplyr::coalesce(subTitle, "")), |
| 77 | sprintf("Author: %s", dplyr::coalesce(author, "")), |
| 78 | sprintf("Translator: %s", dplyr::coalesce(translator, "")), |
| 79 | sprintf("Publisher: %s", dplyr::coalesce(publisher, "")), |
| 80 | sprintf("Publication year (German edition): %s", dplyr::coalesce(pubDate, "")), |
| 81 | sprintf("ISBN: %s", dplyr::coalesce(ISBN, "")) |
| 82 | ) |
| 83 | context <- paste(fields, collapse = "\n") |
| 84 | paste0( |
| 85 | "You are a bibliographic assistant. For the following German-language book, determine:", |
| 86 | "\n1) Whether it is a translation.", |
| 87 | "\n2) If yes: the original title and the year of first publication.", |
| 88 | "\n\nReturn a single JSON object only — no prose — with exactly these keys:", |
| 89 | "\n{\n \"is_translation\": <boolean>,\n \"original_title\": <string|null>,\n \"original_publication_year\": <integer|null>,\n \"confidence\": <number between 0 and 1>\n}", |
| 90 | "\nUse null for unknown. Output JSON only.", |
| 91 | "\n\nMetadata:\n", |
| 92 | context |
| 93 | ) |
| 94 | } |
| Marc Kupietz | 43bb93f | 2025-09-25 20:59:14 +0200 | [diff] [blame] | 95 | } |
| 96 | |
| 97 | # Resolve tidyllm provider from model prefix |
| 98 | resolve_provider_for_model <- function(model) { |
| 99 | if (grepl("^gpt-", model, ignore.case = TRUE)) { |
| 100 | return(tidyllm::openai()) |
| 101 | } else if (grepl("^claude-", model, ignore.case = TRUE)) { |
| 102 | return(tidyllm::claude()) |
| 103 | } else if (grepl("^gemini-", model, ignore.case = TRUE)) { |
| 104 | return(tidyllm::gemini()) |
| 105 | } else if (grepl("^deepseek", model, ignore.case = TRUE)) { |
| 106 | return(tidyllm::deepseek()) |
| 107 | } |
| 108 | # Default fallback |
| 109 | tidyllm::deepseek() |
| 110 | } |
| 111 | |
| 112 | # Friendly API key checks per provider family |
| 113 | ensure_api_key_for_model <- function(model) { |
| 114 | if (grepl("^gpt-", model, ignore.case = TRUE)) { |
| 115 | if (!nzchar(Sys.getenv("OPENAI_API_KEY"))) { |
| 116 | stop("OPENAI_API_KEY is not set. Set it via Sys.setenv('OPENAI_API_KEY'='...') or source your shell env.") |
| 117 | } |
| 118 | } else if (grepl("^claude-", model, ignore.case = TRUE)) { |
| 119 | if (!nzchar(Sys.getenv("ANTHROPIC_API_KEY"))) { |
| 120 | stop("ANTHROPIC_API_KEY is not set. Set it via Sys.setenv('ANTHROPIC_API_KEY'='...') or source your shell env.") |
| 121 | } |
| 122 | } else if (grepl("^gemini-", model, ignore.case = TRUE)) { |
| 123 | if (!nzchar(Sys.getenv("GOOGLE_API_KEY"))) { |
| 124 | stop("GOOGLE_API_KEY is not set. Set it via Sys.setenv('GOOGLE_API_KEY'='...') or source your shell env.") |
| 125 | } |
| 126 | } else if (grepl("^deepseek", model, ignore.case = TRUE)) { |
| 127 | if (!nzchar(Sys.getenv("DEEPSEEK_API_KEY"))) { |
| 128 | stop("DEEPSEEK_API_KEY is not set. Set it via Sys.setenv('DEEPSEEK_API_KEY'='...') or source your shell env.") |
| 129 | } |
| 130 | } |
| 131 | invisible(TRUE) |
| 132 | } |
| 133 | |
| 134 | # Infer original title and year for a single record via LLM (DeepSeek by default) |
| 135 | infer_original_single <- function( |
| 136 | title, |
| 137 | author = NA_character_, |
| 138 | subTitle = NA_character_, |
| 139 | translator = NA_character_, |
| 140 | publisher = NA_character_, |
| 141 | pubDate = NA_character_, |
| 142 | ISBN = NA_character_, |
| 143 | provider = NULL, |
| 144 | model = "deepseek-chat", |
| 145 | temperature = 0.1, |
| 146 | max_tries = 3, |
| 147 | timeout = 60) { |
| 148 | prompt <- build_translation_prompt( |
| 149 | title = title, |
| 150 | author = author, |
| 151 | subTitle = subTitle, |
| 152 | translator = translator, |
| 153 | publisher = publisher, |
| 154 | pubDate = pubDate, |
| Marc Kupietz | 1bfd830 | 2025-09-25 21:14:45 +0200 | [diff] [blame] | 155 | ISBN = ISBN, |
| 156 | prompt_language = "en" |
| Marc Kupietz | 43bb93f | 2025-09-25 20:59:14 +0200 | [diff] [blame] | 157 | ) |
| 158 | |
| 159 | # Resolve provider if not supplied |
| 160 | if (is.null(provider)) { |
| 161 | provider <- resolve_provider_for_model(model) |
| 162 | } |
| 163 | |
| Marc Kupietz | 1bfd830 | 2025-09-25 21:14:45 +0200 | [diff] [blame] | 164 | result <- NULL |
| 165 | chat_err <- NULL |
| 166 | tryCatch( |
| Marc Kupietz | 43bb93f | 2025-09-25 20:59:14 +0200 | [diff] [blame] | 167 | { |
| Marc Kupietz | 1bfd830 | 2025-09-25 21:14:45 +0200 | [diff] [blame] | 168 | result <- tidyllm::llm_message(prompt) |> |
| Marc Kupietz | 43bb93f | 2025-09-25 20:59:14 +0200 | [diff] [blame] | 169 | tidyllm::chat( |
| 170 | .provider = provider, |
| 171 | .model = model, |
| 172 | .temperature = temperature, |
| 173 | .timeout = timeout, |
| 174 | .max_tries = max_tries |
| 175 | ) |
| 176 | }, |
| Marc Kupietz | 1bfd830 | 2025-09-25 21:14:45 +0200 | [diff] [blame] | 177 | error = function(e) { |
| 178 | chat_err <<- conditionMessage(e) |
| 179 | } |
| Marc Kupietz | 43bb93f | 2025-09-25 20:59:14 +0200 | [diff] [blame] | 180 | ) |
| 181 | |
| Marc Kupietz | 1bfd830 | 2025-09-25 21:14:45 +0200 | [diff] [blame] | 182 | if (!is.null(chat_err)) { |
| 183 | message(sprintf( |
| 184 | "LLM request failed (model=%s, title=\"%s\", author=\"%s\"): %s", |
| 185 | model, as.character(title), as.character(author), chat_err |
| 186 | )) |
| Marc Kupietz | 43bb93f | 2025-09-25 20:59:14 +0200 | [diff] [blame] | 187 | return(tibble::tibble( |
| 188 | original_title = NA_character_, |
| 189 | original_publication_year = as.integer(NA), |
| 190 | orig_confidence = as.numeric(NA) |
| 191 | )) |
| 192 | } |
| 193 | |
| 194 | # Prefer JSON parsing via tidyllm helper; falls back to NA on failure |
| 195 | parsed <- NULL |
| 196 | suppressWarnings({ |
| 197 | parsed <- tidyllm::get_reply_data(result) |
| 198 | }) |
| 199 | |
| 200 | if (is.null(parsed)) { |
| 201 | # Try to extract JSON manually as a very last resort |
| 202 | raw <- tidyllm::get_reply(result) |
| 203 | json_txt <- stringr::str_extract(raw, "\\{[\\s\\S]*\\}") |
| 204 | if (!is.na(json_txt)) { |
| 205 | parsed <- try(jsonlite::fromJSON(json_txt), silent = TRUE) |
| 206 | if (inherits(parsed, "try-error")) parsed <- NULL |
| 207 | } |
| 208 | } |
| 209 | |
| 210 | if (is.null(parsed)) { |
| 211 | return(tibble::tibble( |
| 212 | original_title = NA_character_, |
| 213 | original_publication_year = as.integer(NA), |
| 214 | orig_confidence = as.numeric(NA) |
| 215 | )) |
| 216 | } |
| 217 | |
| 218 | ot <- parsed[["original_title"]] |
| 219 | oy <- suppressWarnings(as.integer(parsed[["original_publication_year"]])) |
| 220 | cf <- suppressWarnings(as.numeric(parsed[["confidence"]])) |
| 221 | |
| 222 | if (!is.na(cf)) cf <- max(0, min(1, cf)) |
| 223 | |
| 224 | tibble::tibble( |
| 225 | original_title = dplyr::if_else(is.na(ot) | is.null(ot), NA_character_, as.character(ot)), |
| 226 | original_publication_year = dplyr::if_else(is.na(oy), as.integer(NA), as.integer(oy)), |
| 227 | orig_confidence = dplyr::if_else(is.na(cf), as.numeric(NA), as.numeric(cf)) |
| 228 | ) |
| 229 | } |
| 230 | |
| 231 | # Public API: augment an existing metadata df with original title/year (+ optional confidence) |
| 232 | augment_metadata_with_original <- function( |
| 233 | df, |
| 234 | provider = NULL, |
| Marc Kupietz | 1bfd830 | 2025-09-25 21:14:45 +0200 | [diff] [blame] | 235 | model = "gemini-2.5-pro", |
| Marc Kupietz | 43bb93f | 2025-09-25 20:59:14 +0200 | [diff] [blame] | 236 | include_confidence = TRUE, |
| 237 | temperature = 0.1, |
| 238 | max_tries = 3, |
| 239 | timeout = 60, |
| 240 | pause_seconds = 0, |
| 241 | check_api_key = TRUE, |
| 242 | show_progress = TRUE, |
| 243 | progress_style = 3) { |
| 244 | stopifnot(is.data.frame(df)) |
| 245 | |
| 246 | needed <- c("title", "author", "subTitle", "translator", "publisher", "pubDate", "ISBN") |
| 247 | missing <- setdiff(needed, names(df)) |
| 248 | if (length(missing) > 0) { |
| 249 | stop(sprintf("Missing required columns: %s", paste(missing, collapse = ", "))) |
| 250 | } |
| 251 | |
| 252 | # Friendly pre-check for required API key depending on the chosen model/provider |
| 253 | if (isTRUE(check_api_key)) { |
| 254 | ensure_api_key_for_model(model) |
| 255 | } |
| 256 | |
| 257 | # Resolve provider if not supplied |
| 258 | if (is.null(provider)) { |
| 259 | provider <- resolve_provider_for_model(model) |
| 260 | } |
| 261 | |
| 262 | # Prepare inputs and iterate using purrr::pmap_dfr to avoid NSE warnings |
| 263 | input_cols <- tibble::tibble( |
| 264 | title = df[["title"]], |
| 265 | author = df[["author"]], |
| 266 | subTitle = df[["subTitle"]], |
| 267 | translator = df[["translator"]], |
| 268 | publisher = df[["publisher"]], |
| 269 | pubDate = df[["pubDate"]], |
| 270 | ISBN = df[["ISBN"]] |
| 271 | ) |
| 272 | |
| 273 | infer_wrapper <- function(title, author, subTitle, translator, publisher, pubDate, ISBN) { |
| 274 | res <- infer_original_single( |
| 275 | title = title, |
| 276 | author = author, |
| 277 | subTitle = subTitle, |
| 278 | translator = translator, |
| 279 | publisher = publisher, |
| 280 | pubDate = pubDate, |
| 281 | ISBN = ISBN, |
| 282 | provider = provider, |
| 283 | model = model, |
| 284 | temperature = temperature, |
| 285 | max_tries = max_tries, |
| 286 | timeout = timeout |
| 287 | ) |
| 288 | if (pause_seconds > 0) Sys.sleep(pause_seconds) |
| 289 | res |
| 290 | } |
| 291 | |
| 292 | n <- nrow(input_cols) |
| 293 | results_list <- vector("list", n) |
| 294 | empty_res <- tibble::tibble( |
| 295 | original_title = NA_character_, |
| 296 | original_publication_year = as.integer(NA), |
| 297 | orig_confidence = as.numeric(NA) |
| 298 | ) |
| 299 | pb <- NULL |
| 300 | if (isTRUE(show_progress) && n > 0) { |
| 301 | pb <- utils::txtProgressBar(min = 0, max = n, style = progress_style) |
| 302 | } |
| 303 | for (i in seq_len(n)) { |
| 304 | res <- tryCatch( |
| 305 | infer_wrapper( |
| 306 | title = input_cols$title[i], |
| 307 | author = input_cols$author[i], |
| 308 | subTitle = input_cols$subTitle[i], |
| 309 | translator = input_cols$translator[i], |
| 310 | publisher = input_cols$publisher[i], |
| 311 | pubDate = input_cols$pubDate[i], |
| 312 | ISBN = input_cols$ISBN[i] |
| 313 | ), |
| Marc Kupietz | 1bfd830 | 2025-09-25 21:14:45 +0200 | [diff] [blame] | 314 | error = function(e) { |
| 315 | message(sprintf( |
| 316 | "Row %d failed (model=%s, title=\"%s\", author=\"%s\"): %s", |
| 317 | i, model, as.character(input_cols$title[i]), as.character(input_cols$author[i]), conditionMessage(e) |
| 318 | )) |
| 319 | empty_res |
| 320 | } |
| Marc Kupietz | 43bb93f | 2025-09-25 20:59:14 +0200 | [diff] [blame] | 321 | ) |
| 322 | if (is.null(res) || !is.data.frame(res) || nrow(res) < 1) { |
| 323 | res <- empty_res |
| 324 | } else if (nrow(res) > 1) { |
| 325 | res <- res[1, , drop = FALSE] |
| 326 | } |
| 327 | results_list[[i]] <- res |
| 328 | if (!is.null(pb)) utils::setTxtProgressBar(pb, i) |
| 329 | } |
| 330 | if (!is.null(pb)) close(pb) |
| 331 | results_tbl <- dplyr::bind_rows(results_list) |
| 332 | if (nrow(results_tbl) != n) { |
| 333 | # Ensure row count aligns with df to avoid recycling errors |
| 334 | if (nrow(results_tbl) < n) { |
| 335 | pad <- dplyr::bind_rows(rep(list(empty_res), n - nrow(results_tbl))) |
| 336 | results_tbl <- dplyr::bind_rows(results_tbl, pad) |
| 337 | } else if (nrow(results_tbl) > n) { |
| 338 | results_tbl <- dplyr::slice(results_tbl, seq_len(n)) |
| 339 | } |
| 340 | } |
| 341 | |
| 342 | out <- dplyr::bind_cols(df, results_tbl) |
| 343 | if (!include_confidence) out <- dplyr::select(out, -"orig_confidence") |
| 344 | out |
| 345 | } |
| 346 | |
| 347 | # df_aug <- augment_metadata_with_original(df, model = "gemini-2.5-pro") |
| Marc Kupietz | 1bfd830 | 2025-09-25 21:14:45 +0200 | [diff] [blame] | 348 | df_aug <- augment_metadata_with_original(df, model = "deepseek-chat") |
| Marc Kupietz | 43bb93f | 2025-09-25 20:59:14 +0200 | [diff] [blame] | 349 | |
| 350 | # # OpenAI |
| 351 | # df_aug <- augment_metadata_with_original(df, model = "gpt-4o-mini") |
| 352 | |
| 353 | # # Claude |
| Marc Kupietz | 1bfd830 | 2025-09-25 21:14:45 +0200 | [diff] [blame] | 354 | # df_aug <- augment_metadata_with_original(df, model = "claude-3-5-sonnet-latest") |
| Marc Kupietz | 43bb93f | 2025-09-25 20:59:14 +0200 | [diff] [blame] | 355 | |
| 356 | # # Gemini |
| 357 | # df_aug <- augment_metadata_with_original(df, model = "gemini-2.5-pro") |