blob: 06ca031bd0be21210a2560b277b74ded2c4db7a1 [file] [log] [blame]
Marc Kupietz43bb93f2025-09-25 20:59:14 +02001library(RKorAPClient)
2library(tidyverse)
3library(urltools)
4library(tidyllm)
5
6VC <- "author=/Dickens.*/"
7deliko <- KorAPConnection("https://korap.dnb.de", verbose = TRUE)
8query <- corpusQuery(
9 deliko,
10 "<base/s=t>",
11 # this finds each text once
12 vc = VC,
13 fields = c(
14 "textSigle",
15 "title",
16 "subTitle",
17 "author",
18 "translator",
19 "textType",
20 "textTypeRef",
21 "publisher",
22 "pubDate",
23 "pubPlace",
24 "ISBN",
25 "URN"
26 )
27) %>%
28 fetchNext(maxFetch = 2, randomizePageOrder = TRUE)
29
30df <- query@collectedMatches
31
32df <- df %>%
33 mutate(
34 publisher = str_replace_all(publisher, "data:,", ""),
35 URN = URLdecode(str_replace_all(URN, ".*,", ""))
36 ) %>%
37 select(-c("matchStart", "matchEnd"))
38
39# write_tsv(df, "deliko_metadata.tsv")
40
41# Build a compact prompt requesting strict JSON for translation/original info
42build_translation_prompt <- function(
43 title,
44 author = NA_character_,
45 subTitle = NA_character_,
46 translator = NA_character_,
47 publisher = NA_character_,
48 pubDate = NA_character_,
Marc Kupietz1bfd8302025-09-25 21:14:45 +020049 ISBN = NA_character_,
50 prompt_language = c("en", "de")) {
51 prompt_language <- match.arg(prompt_language)
52 if (prompt_language == "de") {
53 fields <- c(
54 sprintf("Titel: %s", dplyr::coalesce(title, "")),
55 sprintf("Untertitel: %s", dplyr::coalesce(subTitle, "")),
56 sprintf("Autor/in: %s", dplyr::coalesce(author, "")),
57 sprintf("Übersetzer/in: %s", dplyr::coalesce(translator, "")),
58 sprintf("Verlag: %s", dplyr::coalesce(publisher, "")),
59 sprintf("Erscheinungsjahr (de): %s", dplyr::coalesce(pubDate, "")),
60 sprintf("ISBN: %s", dplyr::coalesce(ISBN, ""))
61 )
62 context <- paste(fields, collapse = "\n")
63 paste0(
64 "Du bist ein bibliographischer Assistent. Bestimme für das folgende deutschsprachige Buch:",
65 "\n1) Ob es eine Übersetzung ist.",
66 "\n2) Falls ja: den Originaltitel und das Jahr der Erstveröffentlichung.",
67 "\n\nGib ausschließlich ein einzelnes JSON-Objekt zurück – ohne Begleittext – mit exakt diesen Schlüsseln:",
68 "\n{\n \"is_translation\": <boolean>,\n \"original_title\": <string|null>,\n \"original_publication_year\": <integer|null>,\n \"confidence\": <number zwischen 0 und 1>\n}",
69 "\nNutze null für Unbekanntes. Antworte nur mit JSON.",
70 "\n\nMetadaten:\n",
71 context
72 )
73 } else {
74 fields <- c(
75 sprintf("Title: %s", dplyr::coalesce(title, "")),
76 sprintf("Subtitle: %s", dplyr::coalesce(subTitle, "")),
77 sprintf("Author: %s", dplyr::coalesce(author, "")),
78 sprintf("Translator: %s", dplyr::coalesce(translator, "")),
79 sprintf("Publisher: %s", dplyr::coalesce(publisher, "")),
80 sprintf("Publication year (German edition): %s", dplyr::coalesce(pubDate, "")),
81 sprintf("ISBN: %s", dplyr::coalesce(ISBN, ""))
82 )
83 context <- paste(fields, collapse = "\n")
84 paste0(
85 "You are a bibliographic assistant. For the following German-language book, determine:",
86 "\n1) Whether it is a translation.",
87 "\n2) If yes: the original title and the year of first publication.",
88 "\n\nReturn a single JSON object only — no prose — with exactly these keys:",
89 "\n{\n \"is_translation\": <boolean>,\n \"original_title\": <string|null>,\n \"original_publication_year\": <integer|null>,\n \"confidence\": <number between 0 and 1>\n}",
90 "\nUse null for unknown. Output JSON only.",
91 "\n\nMetadata:\n",
92 context
93 )
94 }
Marc Kupietz43bb93f2025-09-25 20:59:14 +020095}
96
97# Resolve tidyllm provider from model prefix
98resolve_provider_for_model <- function(model) {
99 if (grepl("^gpt-", model, ignore.case = TRUE)) {
100 return(tidyllm::openai())
101 } else if (grepl("^claude-", model, ignore.case = TRUE)) {
102 return(tidyllm::claude())
103 } else if (grepl("^gemini-", model, ignore.case = TRUE)) {
104 return(tidyllm::gemini())
105 } else if (grepl("^deepseek", model, ignore.case = TRUE)) {
106 return(tidyllm::deepseek())
107 }
108 # Default fallback
109 tidyllm::deepseek()
110}
111
112# Friendly API key checks per provider family
113ensure_api_key_for_model <- function(model) {
114 if (grepl("^gpt-", model, ignore.case = TRUE)) {
115 if (!nzchar(Sys.getenv("OPENAI_API_KEY"))) {
116 stop("OPENAI_API_KEY is not set. Set it via Sys.setenv('OPENAI_API_KEY'='...') or source your shell env.")
117 }
118 } else if (grepl("^claude-", model, ignore.case = TRUE)) {
119 if (!nzchar(Sys.getenv("ANTHROPIC_API_KEY"))) {
120 stop("ANTHROPIC_API_KEY is not set. Set it via Sys.setenv('ANTHROPIC_API_KEY'='...') or source your shell env.")
121 }
122 } else if (grepl("^gemini-", model, ignore.case = TRUE)) {
123 if (!nzchar(Sys.getenv("GOOGLE_API_KEY"))) {
124 stop("GOOGLE_API_KEY is not set. Set it via Sys.setenv('GOOGLE_API_KEY'='...') or source your shell env.")
125 }
126 } else if (grepl("^deepseek", model, ignore.case = TRUE)) {
127 if (!nzchar(Sys.getenv("DEEPSEEK_API_KEY"))) {
128 stop("DEEPSEEK_API_KEY is not set. Set it via Sys.setenv('DEEPSEEK_API_KEY'='...') or source your shell env.")
129 }
130 }
131 invisible(TRUE)
132}
133
134# Infer original title and year for a single record via LLM (DeepSeek by default)
135infer_original_single <- function(
136 title,
137 author = NA_character_,
138 subTitle = NA_character_,
139 translator = NA_character_,
140 publisher = NA_character_,
141 pubDate = NA_character_,
142 ISBN = NA_character_,
143 provider = NULL,
144 model = "deepseek-chat",
145 temperature = 0.1,
146 max_tries = 3,
147 timeout = 60) {
148 prompt <- build_translation_prompt(
149 title = title,
150 author = author,
151 subTitle = subTitle,
152 translator = translator,
153 publisher = publisher,
154 pubDate = pubDate,
Marc Kupietz1bfd8302025-09-25 21:14:45 +0200155 ISBN = ISBN,
156 prompt_language = "en"
Marc Kupietz43bb93f2025-09-25 20:59:14 +0200157 )
158
159 # Resolve provider if not supplied
160 if (is.null(provider)) {
161 provider <- resolve_provider_for_model(model)
162 }
163
Marc Kupietz1bfd8302025-09-25 21:14:45 +0200164 result <- NULL
165 chat_err <- NULL
166 tryCatch(
Marc Kupietz43bb93f2025-09-25 20:59:14 +0200167 {
Marc Kupietz1bfd8302025-09-25 21:14:45 +0200168 result <- tidyllm::llm_message(prompt) |>
Marc Kupietz43bb93f2025-09-25 20:59:14 +0200169 tidyllm::chat(
170 .provider = provider,
171 .model = model,
172 .temperature = temperature,
173 .timeout = timeout,
174 .max_tries = max_tries
175 )
176 },
Marc Kupietz1bfd8302025-09-25 21:14:45 +0200177 error = function(e) {
178 chat_err <<- conditionMessage(e)
179 }
Marc Kupietz43bb93f2025-09-25 20:59:14 +0200180 )
181
Marc Kupietz1bfd8302025-09-25 21:14:45 +0200182 if (!is.null(chat_err)) {
183 message(sprintf(
184 "LLM request failed (model=%s, title=\"%s\", author=\"%s\"): %s",
185 model, as.character(title), as.character(author), chat_err
186 ))
Marc Kupietz43bb93f2025-09-25 20:59:14 +0200187 return(tibble::tibble(
188 original_title = NA_character_,
189 original_publication_year = as.integer(NA),
190 orig_confidence = as.numeric(NA)
191 ))
192 }
193
194 # Prefer JSON parsing via tidyllm helper; falls back to NA on failure
195 parsed <- NULL
196 suppressWarnings({
197 parsed <- tidyllm::get_reply_data(result)
198 })
199
200 if (is.null(parsed)) {
201 # Try to extract JSON manually as a very last resort
202 raw <- tidyllm::get_reply(result)
203 json_txt <- stringr::str_extract(raw, "\\{[\\s\\S]*\\}")
204 if (!is.na(json_txt)) {
205 parsed <- try(jsonlite::fromJSON(json_txt), silent = TRUE)
206 if (inherits(parsed, "try-error")) parsed <- NULL
207 }
208 }
209
210 if (is.null(parsed)) {
211 return(tibble::tibble(
212 original_title = NA_character_,
213 original_publication_year = as.integer(NA),
214 orig_confidence = as.numeric(NA)
215 ))
216 }
217
218 ot <- parsed[["original_title"]]
219 oy <- suppressWarnings(as.integer(parsed[["original_publication_year"]]))
220 cf <- suppressWarnings(as.numeric(parsed[["confidence"]]))
221
222 if (!is.na(cf)) cf <- max(0, min(1, cf))
223
224 tibble::tibble(
225 original_title = dplyr::if_else(is.na(ot) | is.null(ot), NA_character_, as.character(ot)),
226 original_publication_year = dplyr::if_else(is.na(oy), as.integer(NA), as.integer(oy)),
227 orig_confidence = dplyr::if_else(is.na(cf), as.numeric(NA), as.numeric(cf))
228 )
229}
230
231# Public API: augment an existing metadata df with original title/year (+ optional confidence)
232augment_metadata_with_original <- function(
233 df,
234 provider = NULL,
Marc Kupietz1bfd8302025-09-25 21:14:45 +0200235 model = "gemini-2.5-pro",
Marc Kupietz43bb93f2025-09-25 20:59:14 +0200236 include_confidence = TRUE,
237 temperature = 0.1,
238 max_tries = 3,
239 timeout = 60,
240 pause_seconds = 0,
241 check_api_key = TRUE,
242 show_progress = TRUE,
243 progress_style = 3) {
244 stopifnot(is.data.frame(df))
245
246 needed <- c("title", "author", "subTitle", "translator", "publisher", "pubDate", "ISBN")
247 missing <- setdiff(needed, names(df))
248 if (length(missing) > 0) {
249 stop(sprintf("Missing required columns: %s", paste(missing, collapse = ", ")))
250 }
251
252 # Friendly pre-check for required API key depending on the chosen model/provider
253 if (isTRUE(check_api_key)) {
254 ensure_api_key_for_model(model)
255 }
256
257 # Resolve provider if not supplied
258 if (is.null(provider)) {
259 provider <- resolve_provider_for_model(model)
260 }
261
262 # Prepare inputs and iterate using purrr::pmap_dfr to avoid NSE warnings
263 input_cols <- tibble::tibble(
264 title = df[["title"]],
265 author = df[["author"]],
266 subTitle = df[["subTitle"]],
267 translator = df[["translator"]],
268 publisher = df[["publisher"]],
269 pubDate = df[["pubDate"]],
270 ISBN = df[["ISBN"]]
271 )
272
273 infer_wrapper <- function(title, author, subTitle, translator, publisher, pubDate, ISBN) {
274 res <- infer_original_single(
275 title = title,
276 author = author,
277 subTitle = subTitle,
278 translator = translator,
279 publisher = publisher,
280 pubDate = pubDate,
281 ISBN = ISBN,
282 provider = provider,
283 model = model,
284 temperature = temperature,
285 max_tries = max_tries,
286 timeout = timeout
287 )
288 if (pause_seconds > 0) Sys.sleep(pause_seconds)
289 res
290 }
291
292 n <- nrow(input_cols)
293 results_list <- vector("list", n)
294 empty_res <- tibble::tibble(
295 original_title = NA_character_,
296 original_publication_year = as.integer(NA),
297 orig_confidence = as.numeric(NA)
298 )
299 pb <- NULL
300 if (isTRUE(show_progress) && n > 0) {
301 pb <- utils::txtProgressBar(min = 0, max = n, style = progress_style)
302 }
303 for (i in seq_len(n)) {
304 res <- tryCatch(
305 infer_wrapper(
306 title = input_cols$title[i],
307 author = input_cols$author[i],
308 subTitle = input_cols$subTitle[i],
309 translator = input_cols$translator[i],
310 publisher = input_cols$publisher[i],
311 pubDate = input_cols$pubDate[i],
312 ISBN = input_cols$ISBN[i]
313 ),
Marc Kupietz1bfd8302025-09-25 21:14:45 +0200314 error = function(e) {
315 message(sprintf(
316 "Row %d failed (model=%s, title=\"%s\", author=\"%s\"): %s",
317 i, model, as.character(input_cols$title[i]), as.character(input_cols$author[i]), conditionMessage(e)
318 ))
319 empty_res
320 }
Marc Kupietz43bb93f2025-09-25 20:59:14 +0200321 )
322 if (is.null(res) || !is.data.frame(res) || nrow(res) < 1) {
323 res <- empty_res
324 } else if (nrow(res) > 1) {
325 res <- res[1, , drop = FALSE]
326 }
327 results_list[[i]] <- res
328 if (!is.null(pb)) utils::setTxtProgressBar(pb, i)
329 }
330 if (!is.null(pb)) close(pb)
331 results_tbl <- dplyr::bind_rows(results_list)
332 if (nrow(results_tbl) != n) {
333 # Ensure row count aligns with df to avoid recycling errors
334 if (nrow(results_tbl) < n) {
335 pad <- dplyr::bind_rows(rep(list(empty_res), n - nrow(results_tbl)))
336 results_tbl <- dplyr::bind_rows(results_tbl, pad)
337 } else if (nrow(results_tbl) > n) {
338 results_tbl <- dplyr::slice(results_tbl, seq_len(n))
339 }
340 }
341
342 out <- dplyr::bind_cols(df, results_tbl)
343 if (!include_confidence) out <- dplyr::select(out, -"orig_confidence")
344 out
345}
346
347# df_aug <- augment_metadata_with_original(df, model = "gemini-2.5-pro")
Marc Kupietz1bfd8302025-09-25 21:14:45 +0200348df_aug <- augment_metadata_with_original(df, model = "deepseek-chat")
Marc Kupietz43bb93f2025-09-25 20:59:14 +0200349
350# # OpenAI
351# df_aug <- augment_metadata_with_original(df, model = "gpt-4o-mini")
352
353# # Claude
Marc Kupietz1bfd8302025-09-25 21:14:45 +0200354# df_aug <- augment_metadata_with_original(df, model = "claude-3-5-sonnet-latest")
Marc Kupietz43bb93f2025-09-25 20:59:14 +0200355
356# # Gemini
357# df_aug <- augment_metadata_with_original(df, model = "gemini-2.5-pro")