blob: f064fa5a9eba23aa986ac374b4f6d8d700fc26a2 [file] [log] [blame]
library(RKorAPClient)
library(tidyverse)
library(urltools)
library(tidyllm)
VC <- "author=/Dickens.*/"
deliko <- KorAPConnection("https://korap.dnb.de", verbose = TRUE)
query <- corpusQuery(
deliko,
"<base/s=t>",
# this finds each text once
vc = VC,
fields = c(
"textSigle",
"title",
"subTitle",
"author",
"translator",
"textType",
"textTypeRef",
"publisher",
"pubDate",
"pubPlace",
"ISBN",
"URN"
)
) %>%
fetchNext(maxFetch = 2, randomizePageOrder = TRUE)
df <- query@collectedMatches
df <- df %>%
mutate(
publisher = str_replace_all(publisher, "data:,", ""),
URN = URLdecode(str_replace_all(URN, ".*,", ""))
) %>%
select(-c("matchStart", "matchEnd"))
# write_tsv(df, "deliko_metadata.tsv")
# Build a compact prompt requesting strict JSON for translation/original info
build_translation_prompt <- function(
title,
author = NA_character_,
subTitle = NA_character_,
translator = NA_character_,
publisher = NA_character_,
pubDate = NA_character_,
ISBN = NA_character_) {
fields <- c(
sprintf("Titel: %s", dplyr::coalesce(title, "")),
sprintf("Untertitel: %s", dplyr::coalesce(subTitle, "")),
sprintf("Autor/in: %s", dplyr::coalesce(author, "")),
sprintf("Übersetzer/in: %s", dplyr::coalesce(translator, "")),
sprintf("Verlag: %s", dplyr::coalesce(publisher, "")),
sprintf("Erscheinungsjahr (de): %s", dplyr::coalesce(pubDate, "")),
sprintf("ISBN: %s", dplyr::coalesce(ISBN, ""))
)
context <- paste(fields, collapse = "\n")
paste0(
"Du bist ein bibliographischer Assistent. Bestimme für das folgende deutschsprachige Buch:",
"\n1) Ob es eine Übersetzung ist.",
"\n2) Falls ja: den Originaltitel und das Jahr der Erstveröffentlichung.",
"\n\nGib ausschließlich ein einzelnes JSON-Objekt zurück – ohne Begleittext – mit exakt diesen Schlüsseln:",
"\n{\n \"is_translation\": <boolean>,\n \"original_title\": <string|null>,\n \"original_publication_year\": <integer|null>,\n \"confidence\": <number zwischen 0 und 1>\n}",
"\nNutze null für Unbekanntes. Antworte nur mit JSON.",
"\n\nMetadaten:\n",
context
)
}
# Resolve tidyllm provider from model prefix
resolve_provider_for_model <- function(model) {
if (grepl("^gpt-", model, ignore.case = TRUE)) {
return(tidyllm::openai())
} else if (grepl("^claude-", model, ignore.case = TRUE)) {
return(tidyllm::claude())
} else if (grepl("^gemini-", model, ignore.case = TRUE)) {
return(tidyllm::gemini())
} else if (grepl("^deepseek", model, ignore.case = TRUE)) {
return(tidyllm::deepseek())
}
# Default fallback
tidyllm::deepseek()
}
# Friendly API key checks per provider family
ensure_api_key_for_model <- function(model) {
if (grepl("^gpt-", model, ignore.case = TRUE)) {
if (!nzchar(Sys.getenv("OPENAI_API_KEY"))) {
stop("OPENAI_API_KEY is not set. Set it via Sys.setenv('OPENAI_API_KEY'='...') or source your shell env.")
}
} else if (grepl("^claude-", model, ignore.case = TRUE)) {
if (!nzchar(Sys.getenv("ANTHROPIC_API_KEY"))) {
stop("ANTHROPIC_API_KEY is not set. Set it via Sys.setenv('ANTHROPIC_API_KEY'='...') or source your shell env.")
}
} else if (grepl("^gemini-", model, ignore.case = TRUE)) {
if (!nzchar(Sys.getenv("GOOGLE_API_KEY"))) {
stop("GOOGLE_API_KEY is not set. Set it via Sys.setenv('GOOGLE_API_KEY'='...') or source your shell env.")
}
} else if (grepl("^deepseek", model, ignore.case = TRUE)) {
if (!nzchar(Sys.getenv("DEEPSEEK_API_KEY"))) {
stop("DEEPSEEK_API_KEY is not set. Set it via Sys.setenv('DEEPSEEK_API_KEY'='...') or source your shell env.")
}
}
invisible(TRUE)
}
# Infer original title and year for a single record via LLM (DeepSeek by default)
infer_original_single <- function(
title,
author = NA_character_,
subTitle = NA_character_,
translator = NA_character_,
publisher = NA_character_,
pubDate = NA_character_,
ISBN = NA_character_,
provider = NULL,
model = "deepseek-chat",
temperature = 0.1,
max_tries = 3,
timeout = 60) {
prompt <- build_translation_prompt(
title = title,
author = author,
subTitle = subTitle,
translator = translator,
publisher = publisher,
pubDate = pubDate,
ISBN = ISBN
)
# Resolve provider if not supplied
if (is.null(provider)) {
provider <- resolve_provider_for_model(model)
}
result <- try(
{
tidyllm::llm_message(prompt) |>
tidyllm::chat(
.provider = provider,
.model = model,
.temperature = temperature,
.timeout = timeout,
.max_tries = max_tries
)
},
silent = TRUE
)
if (inherits(result, "try-error")) {
return(tibble::tibble(
original_title = NA_character_,
original_publication_year = as.integer(NA),
orig_confidence = as.numeric(NA)
))
}
# Prefer JSON parsing via tidyllm helper; falls back to NA on failure
parsed <- NULL
suppressWarnings({
parsed <- tidyllm::get_reply_data(result)
})
if (is.null(parsed)) {
# Try to extract JSON manually as a very last resort
raw <- tidyllm::get_reply(result)
json_txt <- stringr::str_extract(raw, "\\{[\\s\\S]*\\}")
if (!is.na(json_txt)) {
parsed <- try(jsonlite::fromJSON(json_txt), silent = TRUE)
if (inherits(parsed, "try-error")) parsed <- NULL
}
}
if (is.null(parsed)) {
return(tibble::tibble(
original_title = NA_character_,
original_publication_year = as.integer(NA),
orig_confidence = as.numeric(NA)
))
}
ot <- parsed[["original_title"]]
oy <- suppressWarnings(as.integer(parsed[["original_publication_year"]]))
cf <- suppressWarnings(as.numeric(parsed[["confidence"]]))
if (!is.na(cf)) cf <- max(0, min(1, cf))
tibble::tibble(
original_title = dplyr::if_else(is.na(ot) | is.null(ot), NA_character_, as.character(ot)),
original_publication_year = dplyr::if_else(is.na(oy), as.integer(NA), as.integer(oy)),
orig_confidence = dplyr::if_else(is.na(cf), as.numeric(NA), as.numeric(cf))
)
}
# Public API: augment an existing metadata df with original title/year (+ optional confidence)
augment_metadata_with_original <- function(
df,
provider = NULL,
model = "deepseek-chat",
include_confidence = TRUE,
temperature = 0.1,
max_tries = 3,
timeout = 60,
pause_seconds = 0,
check_api_key = TRUE,
show_progress = TRUE,
progress_style = 3) {
stopifnot(is.data.frame(df))
needed <- c("title", "author", "subTitle", "translator", "publisher", "pubDate", "ISBN")
missing <- setdiff(needed, names(df))
if (length(missing) > 0) {
stop(sprintf("Missing required columns: %s", paste(missing, collapse = ", ")))
}
# Friendly pre-check for required API key depending on the chosen model/provider
if (isTRUE(check_api_key)) {
ensure_api_key_for_model(model)
}
# Resolve provider if not supplied
if (is.null(provider)) {
provider <- resolve_provider_for_model(model)
}
# Prepare inputs and iterate using purrr::pmap_dfr to avoid NSE warnings
input_cols <- tibble::tibble(
title = df[["title"]],
author = df[["author"]],
subTitle = df[["subTitle"]],
translator = df[["translator"]],
publisher = df[["publisher"]],
pubDate = df[["pubDate"]],
ISBN = df[["ISBN"]]
)
infer_wrapper <- function(title, author, subTitle, translator, publisher, pubDate, ISBN) {
res <- infer_original_single(
title = title,
author = author,
subTitle = subTitle,
translator = translator,
publisher = publisher,
pubDate = pubDate,
ISBN = ISBN,
provider = provider,
model = model,
temperature = temperature,
max_tries = max_tries,
timeout = timeout
)
if (pause_seconds > 0) Sys.sleep(pause_seconds)
res
}
n <- nrow(input_cols)
results_list <- vector("list", n)
empty_res <- tibble::tibble(
original_title = NA_character_,
original_publication_year = as.integer(NA),
orig_confidence = as.numeric(NA)
)
pb <- NULL
if (isTRUE(show_progress) && n > 0) {
pb <- utils::txtProgressBar(min = 0, max = n, style = progress_style)
}
for (i in seq_len(n)) {
res <- tryCatch(
infer_wrapper(
title = input_cols$title[i],
author = input_cols$author[i],
subTitle = input_cols$subTitle[i],
translator = input_cols$translator[i],
publisher = input_cols$publisher[i],
pubDate = input_cols$pubDate[i],
ISBN = input_cols$ISBN[i]
),
error = function(e) empty_res
)
if (is.null(res) || !is.data.frame(res) || nrow(res) < 1) {
res <- empty_res
} else if (nrow(res) > 1) {
res <- res[1, , drop = FALSE]
}
results_list[[i]] <- res
if (!is.null(pb)) utils::setTxtProgressBar(pb, i)
}
if (!is.null(pb)) close(pb)
results_tbl <- dplyr::bind_rows(results_list)
if (nrow(results_tbl) != n) {
# Ensure row count aligns with df to avoid recycling errors
if (nrow(results_tbl) < n) {
pad <- dplyr::bind_rows(rep(list(empty_res), n - nrow(results_tbl)))
results_tbl <- dplyr::bind_rows(results_tbl, pad)
} else if (nrow(results_tbl) > n) {
results_tbl <- dplyr::slice(results_tbl, seq_len(n))
}
}
out <- dplyr::bind_cols(df, results_tbl)
if (!include_confidence) out <- dplyr::select(out, -"orig_confidence")
out
}
# df_aug <- augment_metadata_with_original(df, model = "gemini-2.5-pro")
df_aug <- augment_metadata_with_original(df, model = "deepseek")
# # OpenAI
# df_aug <- augment_metadata_with_original(df, model = "gpt-4o-mini")
# # Claude
#df_aug <- augment_metadata_with_original(df, model = "claude-3-5-sonnet-latest")
# # Gemini
# df_aug <- augment_metadata_with_original(df, model = "gemini-2.5-pro")