library(RKorAPClient)
library(tidyverse)
library(urltools)
library(tidyllm)

VC <- "author=/Dickens.*/"
deliko <- KorAPConnection("https://korap.dnb.de", verbose = TRUE)
query <- corpusQuery(
  deliko,
  "<base/s=t>",
  # this finds each text once
  vc = VC,
  fields = c(
    "textSigle",
    "title",
    "subTitle",
    "author",
    "translator",
    "textType",
    "textTypeRef",
    "publisher",
    "pubDate",
    "pubPlace",
    "ISBN",
    "URN"
  )
) %>%
  fetchNext(maxFetch = 2, randomizePageOrder = TRUE)

df <- query@collectedMatches

df <- df %>%
  mutate(
    publisher = str_replace_all(publisher, "data:,", ""),
    URN = url_decode(str_replace_all(URN, ".*,", ""))
  ) %>%
  select(-c("matchStart", "matchEnd"))

# write_tsv(df, "deliko_metadata.tsv")

# Build a compact prompt requesting strict JSON for translation/original info
build_translation_prompt <- function(
    title,
    author = NA_character_,
    subTitle = NA_character_,
    translator = NA_character_,
    publisher = NA_character_,
    pubDate = NA_character_,
    ISBN = NA_character_) {
  fields <- c(
    sprintf("Titel: %s", dplyr::coalesce(title, "")),
    sprintf("Untertitel: %s", dplyr::coalesce(subTitle, "")),
    sprintf("Autor/in: %s", dplyr::coalesce(author, "")),
    sprintf("Übersetzer/in: %s", dplyr::coalesce(translator, "")),
    sprintf("Verlag: %s", dplyr::coalesce(publisher, "")),
    sprintf("Erscheinungsjahr (de): %s", dplyr::coalesce(pubDate, "")),
    sprintf("ISBN: %s", dplyr::coalesce(ISBN, ""))
  )
  context <- paste(fields, collapse = "\n")
  paste0(
    "Du bist ein bibliographischer Assistent. Bestimme für das folgende deutschsprachige Buch:",
    "\n1) Ob es eine Übersetzung ist.",
    "\n2) Falls ja: den Originaltitel und das Jahr der Erstveröffentlichung.",
    "\n\nGib ausschließlich ein einzelnes JSON-Objekt zurück – ohne Begleittext – mit exakt diesen Schlüsseln:",
    "\n{\n \"is_translation\": <boolean>,\n \"original_title\": <string|null>,\n \"original_publication_year\": <integer|null>,\n \"confidence\": <number zwischen 0 und 1>\n}",
    "\nNutze null für Unbekanntes. Antworte nur mit JSON.",
    "\n\nMetadaten:\n",
    context
  )
}

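# Quick optional sanity check (illustrative; uses the first row of df and assumes
# at least one match was fetched): print the prompt that would be sent to the LLM.
# cat(build_translation_prompt(
#   title = df$title[1],
#   author = df$author[1],
#   translator = df$translator[1]
# ))
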
# Resolve tidyllm provider from model prefix
resolve_provider_for_model <- function(model) {
  if (grepl("^gpt-", model, ignore.case = TRUE)) {
    return(tidyllm::openai())
  } else if (grepl("^claude-", model, ignore.case = TRUE)) {
    return(tidyllm::claude())
  } else if (grepl("^gemini-", model, ignore.case = TRUE)) {
    return(tidyllm::gemini())
  } else if (grepl("^deepseek", model, ignore.case = TRUE)) {
    return(tidyllm::deepseek())
  }
  # Default fallback
  tidyllm::deepseek()
}

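# Illustrative mapping implied by the prefixes above:
#   "gpt-4o-mini"              -> tidyllm::openai()
#   "claude-3-5-sonnet-latest" -> tidyllm::claude()
#   "gemini-2.5-pro"           -> tidyllm::gemini()
#   "deepseek-chat" / other    -> tidyllm::deepseek()
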
# Friendly API key checks per provider family
ensure_api_key_for_model <- function(model) {
  if (grepl("^gpt-", model, ignore.case = TRUE)) {
    if (!nzchar(Sys.getenv("OPENAI_API_KEY"))) {
      stop("OPENAI_API_KEY is not set. Set it via Sys.setenv('OPENAI_API_KEY'='...') or source your shell env.")
    }
  } else if (grepl("^claude-", model, ignore.case = TRUE)) {
    if (!nzchar(Sys.getenv("ANTHROPIC_API_KEY"))) {
      stop("ANTHROPIC_API_KEY is not set. Set it via Sys.setenv('ANTHROPIC_API_KEY'='...') or source your shell env.")
    }
  } else if (grepl("^gemini-", model, ignore.case = TRUE)) {
    if (!nzchar(Sys.getenv("GOOGLE_API_KEY"))) {
      stop("GOOGLE_API_KEY is not set. Set it via Sys.setenv('GOOGLE_API_KEY'='...') or source your shell env.")
    }
  } else if (grepl("^deepseek", model, ignore.case = TRUE)) {
    if (!nzchar(Sys.getenv("DEEPSEEK_API_KEY"))) {
      stop("DEEPSEEK_API_KEY is not set. Set it via Sys.setenv('DEEPSEEK_API_KEY'='...') or source your shell env.")
    }
  }
  invisible(TRUE)
}

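# Illustrative: for the default DeepSeek model, the key can be set like this
# (placeholder value) before running the inference functions below.
# Sys.setenv(DEEPSEEK_API_KEY = "sk-...")
# ensure_api_key_for_model("deepseek-chat")
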
# Infer original title and year for a single record via LLM (DeepSeek by default)
infer_original_single <- function(
    title,
    author = NA_character_,
    subTitle = NA_character_,
    translator = NA_character_,
    publisher = NA_character_,
    pubDate = NA_character_,
    ISBN = NA_character_,
    provider = NULL,
    model = "deepseek-chat",
    temperature = 0.1,
    max_tries = 3,
    timeout = 60) {
  prompt <- build_translation_prompt(
    title = title,
    author = author,
    subTitle = subTitle,
    translator = translator,
    publisher = publisher,
    pubDate = pubDate,
    ISBN = ISBN
  )

  # Resolve provider if not supplied
  if (is.null(provider)) {
    provider <- resolve_provider_for_model(model)
  }

  result <- try(
    {
      tidyllm::llm_message(prompt) |>
        tidyllm::chat(
          .provider = provider,
          .model = model,
          .temperature = temperature,
          .timeout = timeout,
          .max_tries = max_tries
        )
    },
    silent = TRUE
  )

  if (inherits(result, "try-error")) {
    return(tibble::tibble(
      original_title = NA_character_,
      original_publication_year = as.integer(NA),
      orig_confidence = as.numeric(NA)
    ))
  }

  # Prefer JSON parsing via tidyllm helper; falls back to NA on failure
  parsed <- NULL
  suppressWarnings({
    parsed <- tidyllm::get_reply_data(result)
  })

  if (is.null(parsed)) {
    # Try to extract JSON manually as a very last resort
    raw <- tidyllm::get_reply(result)
    json_txt <- stringr::str_extract(raw, "\\{[\\s\\S]*\\}")
    if (!is.na(json_txt)) {
      parsed <- try(jsonlite::fromJSON(json_txt), silent = TRUE)
      if (inherits(parsed, "try-error")) parsed <- NULL
    }
  }

  if (is.null(parsed)) {
    return(tibble::tibble(
      original_title = NA_character_,
      original_publication_year = as.integer(NA),
      orig_confidence = as.numeric(NA)
    ))
  }

  ot <- parsed[["original_title"]]
  oy <- suppressWarnings(as.integer(parsed[["original_publication_year"]]))
  cf <- suppressWarnings(as.numeric(parsed[["confidence"]]))

  # Guard against NULL or missing keys (length 0) before testing for NA
  ot <- if (is.null(ot) || length(ot) != 1 || is.na(ot)) NA_character_ else as.character(ot)
  oy <- if (length(oy) != 1 || is.na(oy)) NA_integer_ else oy
  cf <- if (length(cf) != 1 || is.na(cf)) NA_real_ else max(0, min(1, cf))

  tibble::tibble(
    original_title = ot,
    original_publication_year = oy,
    orig_confidence = cf
  )
}

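# Illustrative single-record call (requires the matching API key to be set;
# uses the first row of df and assumes at least one match was fetched):
# infer_original_single(
#   title = df$title[1],
#   author = df$author[1],
#   translator = df$translator[1]
# )
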
# Public API: augment an existing metadata df with original title/year (+ optional confidence)
augment_metadata_with_original <- function(
    df,
    provider = NULL,
    model = "deepseek-chat",
    include_confidence = TRUE,
    temperature = 0.1,
    max_tries = 3,
    timeout = 60,
    pause_seconds = 0,
    check_api_key = TRUE,
    show_progress = TRUE,
    progress_style = 3) {
  stopifnot(is.data.frame(df))

  needed <- c("title", "author", "subTitle", "translator", "publisher", "pubDate", "ISBN")
  missing <- setdiff(needed, names(df))
  if (length(missing) > 0) {
    stop(sprintf("Missing required columns: %s", paste(missing, collapse = ", ")))
  }

  # Friendly pre-check for required API key depending on the chosen model/provider
  if (isTRUE(check_api_key)) {
    ensure_api_key_for_model(model)
  }

  # Resolve provider if not supplied
  if (is.null(provider)) {
    provider <- resolve_provider_for_model(model)
  }

  # Prepare inputs and iterate row by row (explicit loop so a progress bar can be shown)
  input_cols <- tibble::tibble(
    title = df[["title"]],
    author = df[["author"]],
    subTitle = df[["subTitle"]],
    translator = df[["translator"]],
    publisher = df[["publisher"]],
    pubDate = df[["pubDate"]],
    ISBN = df[["ISBN"]]
  )

  infer_wrapper <- function(title, author, subTitle, translator, publisher, pubDate, ISBN) {
    res <- infer_original_single(
      title = title,
      author = author,
      subTitle = subTitle,
      translator = translator,
      publisher = publisher,
      pubDate = pubDate,
      ISBN = ISBN,
      provider = provider,
      model = model,
      temperature = temperature,
      max_tries = max_tries,
      timeout = timeout
    )
    if (pause_seconds > 0) Sys.sleep(pause_seconds)
    res
  }

  n <- nrow(input_cols)
  results_list <- vector("list", n)
  empty_res <- tibble::tibble(
    original_title = NA_character_,
    original_publication_year = as.integer(NA),
    orig_confidence = as.numeric(NA)
  )
  pb <- NULL
  if (isTRUE(show_progress) && n > 0) {
    pb <- utils::txtProgressBar(min = 0, max = n, style = progress_style)
  }
  for (i in seq_len(n)) {
    res <- tryCatch(
      infer_wrapper(
        title = input_cols$title[i],
        author = input_cols$author[i],
        subTitle = input_cols$subTitle[i],
        translator = input_cols$translator[i],
        publisher = input_cols$publisher[i],
        pubDate = input_cols$pubDate[i],
        ISBN = input_cols$ISBN[i]
      ),
      error = function(e) empty_res
    )
    if (is.null(res) || !is.data.frame(res) || nrow(res) < 1) {
      res <- empty_res
    } else if (nrow(res) > 1) {
      res <- res[1, , drop = FALSE]
    }
    results_list[[i]] <- res
    if (!is.null(pb)) utils::setTxtProgressBar(pb, i)
  }
  if (!is.null(pb)) close(pb)
  results_tbl <- dplyr::bind_rows(results_list)
  if (nrow(results_tbl) != n) {
    # Ensure row count aligns with df to avoid recycling errors
    if (nrow(results_tbl) < n) {
      pad <- dplyr::bind_rows(rep(list(empty_res), n - nrow(results_tbl)))
      results_tbl <- dplyr::bind_rows(results_tbl, pad)
    } else if (nrow(results_tbl) > n) {
      results_tbl <- dplyr::slice(results_tbl, seq_len(n))
    }
  }

  out <- dplyr::bind_cols(df, results_tbl)
  if (!include_confidence) out <- dplyr::select(out, -"orig_confidence")
  out
}

# DeepSeek (default provider; "deepseek-chat" is the model name the API expects)
df_aug <- augment_metadata_with_original(df, model = "deepseek-chat")

# # OpenAI
# df_aug <- augment_metadata_with_original(df, model = "gpt-4o-mini")

# # Claude
# df_aug <- augment_metadata_with_original(df, model = "claude-3-5-sonnet-latest")

# # Gemini
# df_aug <- augment_metadata_with_original(df, model = "gemini-2.5-pro")
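
# Optionally persist the augmented table, mirroring the commented write_tsv() above
# (illustrative; the file name is only a suggestion):
# write_tsv(df_aug, "deliko_metadata_with_originals.tsv")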