| library(RKorAPClient) |
| library(tidyverse) |
| library(urltools) |
| library(tidyllm) |
| |
| VC <- "author=/Dickens.*/" |
| deliko <- KorAPConnection("https://korap.dnb.de", verbose = TRUE) |
| query <- corpusQuery( |
| deliko, |
| "<base/s=t>", |
| # this finds each text once |
| vc = VC, |
| fields = c( |
| "textSigle", |
| "title", |
| "subTitle", |
| "author", |
| "translator", |
| "textType", |
| "textTypeRef", |
| "publisher", |
| "pubDate", |
| "pubPlace", |
| "ISBN", |
| "URN" |
| ) |
| ) %>% |
| fetchNext(maxFetch = 2, randomizePageOrder = TRUE) |
| |
| df <- query@collectedMatches |
| |
| df <- df %>% |
| mutate( |
| publisher = str_replace_all(publisher, "data:,", ""), |
| URN = URLdecode(str_replace_all(URN, ".*,", "")) |
| ) %>% |
| select(-c("matchStart", "matchEnd")) |
| |
| # write_tsv(df, "deliko_metadata.tsv") |
| |
| # Build a compact prompt requesting strict JSON for translation/original info |
| build_translation_prompt <- function( |
| title, |
| author = NA_character_, |
| subTitle = NA_character_, |
| translator = NA_character_, |
| publisher = NA_character_, |
| pubDate = NA_character_, |
| ISBN = NA_character_) { |
| fields <- c( |
| sprintf("Titel: %s", dplyr::coalesce(title, "")), |
| sprintf("Untertitel: %s", dplyr::coalesce(subTitle, "")), |
| sprintf("Autor/in: %s", dplyr::coalesce(author, "")), |
| sprintf("Übersetzer/in: %s", dplyr::coalesce(translator, "")), |
| sprintf("Verlag: %s", dplyr::coalesce(publisher, "")), |
| sprintf("Erscheinungsjahr (de): %s", dplyr::coalesce(pubDate, "")), |
| sprintf("ISBN: %s", dplyr::coalesce(ISBN, "")) |
| ) |
| context <- paste(fields, collapse = "\n") |
| paste0( |
| "Du bist ein bibliographischer Assistent. Bestimme für das folgende deutschsprachige Buch:", |
| "\n1) Ob es eine Übersetzung ist.", |
| "\n2) Falls ja: den Originaltitel und das Jahr der Erstveröffentlichung.", |
| "\n\nGib ausschließlich ein einzelnes JSON-Objekt zurück – ohne Begleittext – mit exakt diesen Schlüsseln:", |
| "\n{\n \"is_translation\": <boolean>,\n \"original_title\": <string|null>,\n \"original_publication_year\": <integer|null>,\n \"confidence\": <number zwischen 0 und 1>\n}", |
| "\nNutze null für Unbekanntes. Antworte nur mit JSON.", |
| "\n\nMetadaten:\n", |
| context |
| ) |
| } |
| |
| # Resolve tidyllm provider from model prefix |
| resolve_provider_for_model <- function(model) { |
| if (grepl("^gpt-", model, ignore.case = TRUE)) { |
| return(tidyllm::openai()) |
| } else if (grepl("^claude-", model, ignore.case = TRUE)) { |
| return(tidyllm::claude()) |
| } else if (grepl("^gemini-", model, ignore.case = TRUE)) { |
| return(tidyllm::gemini()) |
| } else if (grepl("^deepseek", model, ignore.case = TRUE)) { |
| return(tidyllm::deepseek()) |
| } |
| # Default fallback |
| tidyllm::deepseek() |
| } |
| |
| # Friendly API key checks per provider family |
| ensure_api_key_for_model <- function(model) { |
| if (grepl("^gpt-", model, ignore.case = TRUE)) { |
| if (!nzchar(Sys.getenv("OPENAI_API_KEY"))) { |
| stop("OPENAI_API_KEY is not set. Set it via Sys.setenv('OPENAI_API_KEY'='...') or source your shell env.") |
| } |
| } else if (grepl("^claude-", model, ignore.case = TRUE)) { |
| if (!nzchar(Sys.getenv("ANTHROPIC_API_KEY"))) { |
| stop("ANTHROPIC_API_KEY is not set. Set it via Sys.setenv('ANTHROPIC_API_KEY'='...') or source your shell env.") |
| } |
| } else if (grepl("^gemini-", model, ignore.case = TRUE)) { |
| if (!nzchar(Sys.getenv("GOOGLE_API_KEY"))) { |
| stop("GOOGLE_API_KEY is not set. Set it via Sys.setenv('GOOGLE_API_KEY'='...') or source your shell env.") |
| } |
| } else if (grepl("^deepseek", model, ignore.case = TRUE)) { |
| if (!nzchar(Sys.getenv("DEEPSEEK_API_KEY"))) { |
| stop("DEEPSEEK_API_KEY is not set. Set it via Sys.setenv('DEEPSEEK_API_KEY'='...') or source your shell env.") |
| } |
| } |
| invisible(TRUE) |
| } |
| |
| # Infer original title and year for a single record via LLM (DeepSeek by default) |
| infer_original_single <- function( |
| title, |
| author = NA_character_, |
| subTitle = NA_character_, |
| translator = NA_character_, |
| publisher = NA_character_, |
| pubDate = NA_character_, |
| ISBN = NA_character_, |
| provider = NULL, |
| model = "deepseek-chat", |
| temperature = 0.1, |
| max_tries = 3, |
| timeout = 60) { |
| prompt <- build_translation_prompt( |
| title = title, |
| author = author, |
| subTitle = subTitle, |
| translator = translator, |
| publisher = publisher, |
| pubDate = pubDate, |
| ISBN = ISBN |
| ) |
| |
| # Resolve provider if not supplied |
| if (is.null(provider)) { |
| provider <- resolve_provider_for_model(model) |
| } |
| |
| result <- try( |
| { |
| tidyllm::llm_message(prompt) |> |
| tidyllm::chat( |
| .provider = provider, |
| .model = model, |
| .temperature = temperature, |
| .timeout = timeout, |
| .max_tries = max_tries |
| ) |
| }, |
| silent = TRUE |
| ) |
| |
| if (inherits(result, "try-error")) { |
| return(tibble::tibble( |
| original_title = NA_character_, |
| original_publication_year = as.integer(NA), |
| orig_confidence = as.numeric(NA) |
| )) |
| } |
| |
| # Prefer JSON parsing via tidyllm helper; falls back to NA on failure |
| parsed <- NULL |
| suppressWarnings({ |
| parsed <- tidyllm::get_reply_data(result) |
| }) |
| |
| if (is.null(parsed)) { |
| # Try to extract JSON manually as a very last resort |
| raw <- tidyllm::get_reply(result) |
| json_txt <- stringr::str_extract(raw, "\\{[\\s\\S]*\\}") |
| if (!is.na(json_txt)) { |
| parsed <- try(jsonlite::fromJSON(json_txt), silent = TRUE) |
| if (inherits(parsed, "try-error")) parsed <- NULL |
| } |
| } |
| |
| if (is.null(parsed)) { |
| return(tibble::tibble( |
| original_title = NA_character_, |
| original_publication_year = as.integer(NA), |
| orig_confidence = as.numeric(NA) |
| )) |
| } |
| |
| ot <- parsed[["original_title"]] |
| oy <- suppressWarnings(as.integer(parsed[["original_publication_year"]])) |
| cf <- suppressWarnings(as.numeric(parsed[["confidence"]])) |
| |
| if (!is.na(cf)) cf <- max(0, min(1, cf)) |
| |
| tibble::tibble( |
| original_title = dplyr::if_else(is.na(ot) | is.null(ot), NA_character_, as.character(ot)), |
| original_publication_year = dplyr::if_else(is.na(oy), as.integer(NA), as.integer(oy)), |
| orig_confidence = dplyr::if_else(is.na(cf), as.numeric(NA), as.numeric(cf)) |
| ) |
| } |
| |
| # Public API: augment an existing metadata df with original title/year (+ optional confidence) |
| augment_metadata_with_original <- function( |
| df, |
| provider = NULL, |
| model = "deepseek-chat", |
| include_confidence = TRUE, |
| temperature = 0.1, |
| max_tries = 3, |
| timeout = 60, |
| pause_seconds = 0, |
| check_api_key = TRUE, |
| show_progress = TRUE, |
| progress_style = 3) { |
| stopifnot(is.data.frame(df)) |
| |
| needed <- c("title", "author", "subTitle", "translator", "publisher", "pubDate", "ISBN") |
| missing <- setdiff(needed, names(df)) |
| if (length(missing) > 0) { |
| stop(sprintf("Missing required columns: %s", paste(missing, collapse = ", "))) |
| } |
| |
| # Friendly pre-check for required API key depending on the chosen model/provider |
| if (isTRUE(check_api_key)) { |
| ensure_api_key_for_model(model) |
| } |
| |
| # Resolve provider if not supplied |
| if (is.null(provider)) { |
| provider <- resolve_provider_for_model(model) |
| } |
| |
| # Prepare inputs and iterate using purrr::pmap_dfr to avoid NSE warnings |
| input_cols <- tibble::tibble( |
| title = df[["title"]], |
| author = df[["author"]], |
| subTitle = df[["subTitle"]], |
| translator = df[["translator"]], |
| publisher = df[["publisher"]], |
| pubDate = df[["pubDate"]], |
| ISBN = df[["ISBN"]] |
| ) |
| |
| infer_wrapper <- function(title, author, subTitle, translator, publisher, pubDate, ISBN) { |
| res <- infer_original_single( |
| title = title, |
| author = author, |
| subTitle = subTitle, |
| translator = translator, |
| publisher = publisher, |
| pubDate = pubDate, |
| ISBN = ISBN, |
| provider = provider, |
| model = model, |
| temperature = temperature, |
| max_tries = max_tries, |
| timeout = timeout |
| ) |
| if (pause_seconds > 0) Sys.sleep(pause_seconds) |
| res |
| } |
| |
| n <- nrow(input_cols) |
| results_list <- vector("list", n) |
| empty_res <- tibble::tibble( |
| original_title = NA_character_, |
| original_publication_year = as.integer(NA), |
| orig_confidence = as.numeric(NA) |
| ) |
| pb <- NULL |
| if (isTRUE(show_progress) && n > 0) { |
| pb <- utils::txtProgressBar(min = 0, max = n, style = progress_style) |
| } |
| for (i in seq_len(n)) { |
| res <- tryCatch( |
| infer_wrapper( |
| title = input_cols$title[i], |
| author = input_cols$author[i], |
| subTitle = input_cols$subTitle[i], |
| translator = input_cols$translator[i], |
| publisher = input_cols$publisher[i], |
| pubDate = input_cols$pubDate[i], |
| ISBN = input_cols$ISBN[i] |
| ), |
| error = function(e) empty_res |
| ) |
| if (is.null(res) || !is.data.frame(res) || nrow(res) < 1) { |
| res <- empty_res |
| } else if (nrow(res) > 1) { |
| res <- res[1, , drop = FALSE] |
| } |
| results_list[[i]] <- res |
| if (!is.null(pb)) utils::setTxtProgressBar(pb, i) |
| } |
| if (!is.null(pb)) close(pb) |
| results_tbl <- dplyr::bind_rows(results_list) |
| if (nrow(results_tbl) != n) { |
| # Ensure row count aligns with df to avoid recycling errors |
| if (nrow(results_tbl) < n) { |
| pad <- dplyr::bind_rows(rep(list(empty_res), n - nrow(results_tbl))) |
| results_tbl <- dplyr::bind_rows(results_tbl, pad) |
| } else if (nrow(results_tbl) > n) { |
| results_tbl <- dplyr::slice(results_tbl, seq_len(n)) |
| } |
| } |
| |
| out <- dplyr::bind_cols(df, results_tbl) |
| if (!include_confidence) out <- dplyr::select(out, -"orig_confidence") |
| out |
| } |
| |
| # df_aug <- augment_metadata_with_original(df, model = "gemini-2.5-pro") |
| df_aug <- augment_metadata_with_original(df, model = "deepseek") |
| |
| # # OpenAI |
| # df_aug <- augment_metadata_with_original(df, model = "gpt-4o-mini") |
| |
| # # Claude |
| #df_aug <- augment_metadata_with_original(df, model = "claude-3-5-sonnet-latest") |
| |
| # # Gemini |
| # df_aug <- augment_metadata_with_original(df, model = "gemini-2.5-pro") |