Remove unrelated scripts
diff --git a/scripts/duckdb2tibble.R b/scripts/duckdb2tibble.R
deleted file mode 100755
index 490338c..0000000
--- a/scripts/duckdb2tibble.R
+++ /dev/null
@@ -1,85 +0,0 @@
-#!/usr/bin/env Rscript
-
-suppressPackageStartupMessages({
- library(DBI)
- library(duckdb)
- library(tibble)
-})
-
-# Public function: safe to source in RStudio
-# Returns a tibble from DuckDB using either a table name or a SQL query
-duckdb2tibble <- function(
- db = "/home/kupietz/korap4dnb/epub2i5/data/deliko_metadata.duckdb",
- table = "deliko_metadata",
- sql = "",
- limit = 0,
- read_only = TRUE) {
- stopifnot(is.character(db), length(db) == 1)
- if (!file.exists(db)) stop("DuckDB not found: ", db)
-
- drv <- duckdb::duckdb()
- con <- DBI::dbConnect(drv, dbdir = db, read_only = isTRUE(read_only))
- on.exit(
- {
- try(DBI::dbDisconnect(con, shutdown = FALSE), silent = TRUE)
- try(duckdb::duckdb_shutdown(drv), silent = TRUE)
- },
- add = TRUE
- )
-
- if (nzchar(sql)) {
- res <- DBI::dbGetQuery(con, sql)
- return(tibble::as_tibble(res))
- }
-
- q_schema <- DBI::dbQuoteIdentifier(con, "main")
- q_table <- DBI::dbQuoteIdentifier(con, table)
- fq_name <- paste0(as.character(q_schema), ".", as.character(q_table))
- q <- paste0("SELECT * FROM ", fq_name)
- if (is.numeric(limit) && limit > 0) q <- paste0(q, " LIMIT ", as.integer(limit))
-
- tibble::as_tibble(DBI::dbGetQuery(con, q))
-}
-
-# CLI mode only: run when invoked via Rscript
-if (!interactive()) {
- defaults <- list(
- db = "/home/kupietz/korap4dnb/epub2i5/data/deliko_metadata.duckdb",
- table = "deliko_metadata",
- sql = "",
- limit = "0",
- output = ""
- )
- opts <- defaults
- args <- commandArgs(trailingOnly = TRUE)
- for (a in args) {
- if (!grepl("=", a, fixed = TRUE)) next
- kv <- strsplit(a, "=", fixed = TRUE)[[1]]
- if (length(kv) == 2 && kv[1] %in% names(opts)) opts[[kv[1]]] <- kv[2]
- }
-
- # ENV overrides
- opts$db <- Sys.getenv("DELIKO_DUCKDB", opts$db)
- opts$table <- Sys.getenv("DELIKO_TABLE", opts$table)
- opts$sql <- Sys.getenv("DELIKO_SQL", opts$sql)
- opts$limit <- Sys.getenv("LIMIT", opts$limit)
- opts$output <- Sys.getenv("OUT_RDS", opts$output)
-
- message("Connecting DuckDB: ", opts$db)
- df <- duckdb2tibble(
- db = opts$db,
- table = opts$table,
- sql = opts$sql,
- limit = suppressWarnings(as.integer(opts$limit)),
- read_only = TRUE
- )
-
- message("Loaded tibble: ", nrow(df), " rows x ", ncol(df), " cols")
- message("Columns: ", paste(names(df), collapse = ", "))
- print(utils::head(df, n = if (nrow(df) > 10) 10 else nrow(df)))
-
- if (nzchar(opts$output)) {
- saveRDS(df, opts$output)
- message("Saved tibble to ", opts$output)
- }
-}
diff --git a/scripts/sample_origin_trace_duckdb.R b/scripts/sample_origin_trace_duckdb.R
deleted file mode 100755
index 7223036..0000000
--- a/scripts/sample_origin_trace_duckdb.R
+++ /dev/null
@@ -1,589 +0,0 @@
-#!/usr/bin/env Rscript
-
-suppressPackageStartupMessages({
- library(DBI)
- library(duckdb)
- library(dplyr)
- library(purrr)
- library(readr)
- library(stringr)
- library(tidyr)
- library(tidyllm)
-})
-# Defaults (override via CLI key=value or ENV)
-defaults <- list(
- db = "/home/kupietz/korap4dnb/epub2i5/data/deliko_metadata.duckdb",
- table = "deliko_metadata",
- id_col = "ISBN", # stable identifier per record
- dc_col = "dnb_dc", # column with DC XML
- sample_size = "10", # total sample size (default)
- seed = "41", # RNG seed for reproducibility
- out = "sample_origin_trace.tsv", # output TSV path
- include_consensus = "true", # whether to compute consensus columns
- prompt_source = "xml", # text | xml
- show_prompts = "true" # whether to print prompts to console
-)
-
-# Parse key=value CLI args
-opts <- defaults
-args <- commandArgs(trailingOnly = TRUE)
-for (a in args) {
- if (!grepl("=", a, fixed = TRUE)) next
- kv <- strsplit(a, "=", fixed = TRUE)[[1]]
- if (length(kv) == 2 && kv[1] %in% names(opts)) opts[[kv[1]]] <- kv[2]
-}
-
-# ENV overrides
-opts$db <- Sys.getenv("DELIKO_DUCKDB", opts$db)
-opts$table <- Sys.getenv("DELIKO_TABLE", opts$table)
-opts$id_col <- Sys.getenv("DELIKO_ID_COL", opts$id_col)
-opts$dc_col <- Sys.getenv("DELIKO_DC_COL", opts$dc_col)
-opts$sample_size <- Sys.getenv("DELIKO_SAMPLE_SIZE", opts$sample_size)
-opts$seed <- Sys.getenv("DELIKO_SAMPLE_SEED", opts$seed)
-opts$out <- Sys.getenv("DELIKO_SAMPLE_OUT", opts$out)
-opts$include_consensus <- Sys.getenv("DELIKO_INCLUDE_CONSENSUS", opts$include_consensus)
-opts$prompt_source <- Sys.getenv("DELIKO_PROMPT_SOURCE", opts$prompt_source)
-opts$show_prompts <- Sys.getenv("DELIKO_SHOW_PROMPTS", opts$show_prompts)
-
-# Coerce and validate
-sample_size <- suppressWarnings(as.integer(opts$sample_size))
-if (is.na(sample_size) || sample_size <= 0) stop("Invalid sample_size: ", opts$sample_size)
-seed <- suppressWarnings(as.integer(opts$seed))
-if (is.na(seed)) seed <- 42L
-include_consensus <- tolower(opts$include_consensus) %in% c("true", "1", "yes", "y")
-prompt_source <- tolower(opts$prompt_source)
-if (!(prompt_source %in% c("text", "xml"))) prompt_source <- "text"
-message("Prompt source: ", prompt_source)
-show_prompts <- tolower(opts$show_prompts) %in% c("true", "1", "yes", "y")
-
-if (!file.exists(opts$db)) stop("DuckDB not found: ", opts$db)
-
-# Use helper to access DuckDB robustly without managing connection lifecycle here
-duck_env <- new.env(parent = baseenv())
-sys.source("/home/kupietz/korap4dnb/epub2i5/scripts/duckdb2tibble.R", envir = duck_env)
-duckdb2tibble <- get("duckdb2tibble", envir = duck_env)
-
-# Resolve effective column names by reading zero rows
-message("Resolving columns from table: ", opts$table)
-df0 <- duckdb2tibble(
- db = opts$db,
- table = opts$table,
- limit = 0
-)
-if (!is.data.frame(df0)) stop("Failed to read schema from table: ", opts$table)
-cols <- names(df0)
-message(sprintf("Loaded schema columns: %s", paste(cols, collapse = ", ")))
-# safe match that returns NA_character_ if not found
-safe_match_name <- function(name, pool) {
- idx <- match(tolower(name), tolower(pool))
- if (is.na(idx)) {
- return(NA_character_)
- }
- pool[[idx]]
-}
-id_col_eff <- safe_match_name(opts$id_col, cols)
-dc_col_eff <- safe_match_name(opts$dc_col, cols)
-if (is.na(id_col_eff)) stop("ID column not found: ", opts$id_col)
-if (is.na(dc_col_eff)) stop("DC column not found: ", opts$dc_col)
-title_eff <- cols[match(tolower("title"), tolower(cols))]
-author_eff <- cols[match(tolower("author"), tolower(cols))]
-subTitle_eff <- cols[match(tolower("subTitle"), tolower(cols))]
-translator_eff <- cols[match(tolower("translator"), tolower(cols))]
-publisher_eff <- cols[match(tolower("publisher"), tolower(cols))]
-pubDate_eff <- cols[match(tolower("pubDate"), tolower(cols))]
-isbn_eff <- cols[match(tolower("ISBN"), tolower(cols))]
-
-# Build SQL with explicit quoting to handle case-sensitive identifiers
-sel <- c(
- sprintf('"%s" AS id', id_col_eff),
- sprintf('"%s" AS dnb_dc', dc_col_eff)
-)
-if (!is.na(title_eff)) sel <- c(sel, sprintf('"%s" AS title', title_eff))
-if (!is.na(pubDate_eff)) sel <- c(sel, sprintf('"%s" AS pubDate', pubDate_eff))
-if (prompt_source == "text") {
- if (!is.na(author_eff)) sel <- c(sel, sprintf('"%s" AS author', author_eff))
- if (!is.na(subTitle_eff)) sel <- c(sel, sprintf('"%s" AS subTitle', subTitle_eff))
- if (!is.na(translator_eff)) sel <- c(sel, sprintf('"%s" AS translator', translator_eff))
- if (!is.na(publisher_eff)) sel <- c(sel, sprintf('"%s" AS publisher', publisher_eff))
- if (!is.na(isbn_eff)) sel <- c(sel, sprintf('"%s" AS ISBN', isbn_eff))
-}
-sql <- sprintf(
- 'SELECT %s FROM "main"."%s" WHERE "%s" IS NOT NULL AND length(trim("%s")) > 0',
- paste(sel, collapse = ", "), opts$table, dc_col_eff, dc_col_eff
-)
-
-# Fetch minimal data needed for stratification
-df <- duckdb2tibble(db = opts$db, sql = sql)
-if (nrow(df) == 0) stop("No rows with non-empty ", dc_col_eff)
-
-# Stratify by presence of the German term "Übersetzer" in the dnb_dc payload
-df <- df %>% mutate(
- stratum = if_else(str_detect(dnb_dc, fixed("Übersetzer", ignore_case = TRUE)), "with_translator", "without_translator")
-)
-
-# Compute per-stratum sizes (balanced rounding) under total sample_size
-# Helper for infix default used in size allocation
-`%||%` <- function(x, y) if (length(x) == 0 || is.na(x)) y else x
-set.seed(seed)
-tab <- df %>% count(stratum, name = "N")
-if (nrow(tab) == 1) {
- n_with <- if (tab$stratum[1] == "with_translator") sample_size else 0L
- n_without <- if (tab$stratum[1] == "without_translator") sample_size else 0L
-} else {
- p_with <- tab$N[tab$stratum == "with_translator"] / nrow(df)
- if (is.na(p_with)) p_with <- 0
- n_with <- as.integer(round(sample_size * p_with))
- n_with <- min(n_with, tab$N[tab$stratum == "with_translator"] %||% 0L)
- n_without <- sample_size - n_with
- max_without <- tab$N[tab$stratum == "without_translator"] %||% 0L
- if (n_without > max_without) {
- n_without <- max_without
- # move leftover to with_translator if available
- rem <- sample_size - (n_with + n_without)
- max_with <- tab$N[tab$stratum == "with_translator"] %||% 0L
- n_with <- min(n_with + max(rem, 0L), max_with)
- }
-}
-
-# Compute subgroup data frames and sample sizes as constants
-with_df <- df %>% filter(stratum == "with_translator")
-without_df <- df %>% filter(stratum == "without_translator")
-want_with <- floor(sample_size / 2)
-want_without <- sample_size - want_with
-avail_with <- nrow(with_df)
-avail_without <- nrow(without_df)
-take_with <- min(want_with, avail_with)
-take_without <- min(want_without, avail_without)
-rem <- sample_size - (take_with + take_without)
-if (rem > 0) {
- left_with <- max(avail_with - take_with, 0L)
- add_from_with <- min(rem, left_with)
- take_with <- take_with + add_from_with
- rem <- rem - add_from_with
- if (rem > 0) {
- left_without <- max(avail_without - take_without, 0L)
- add_from_without <- min(rem, left_without)
- take_without <- take_without + add_from_without
- rem <- rem - add_from_without
- }
-}
-with_n <- take_with
-without_n <- take_without
-message(sprintf("Stratified target with=%d without=%d; available with=%d without=%d; chosen with=%d without=%d", want_with, want_without, avail_with, avail_without, with_n, without_n))
-
-samp_with <- dplyr::slice_sample(with_df, n = with_n, replace = FALSE)
-samp_without <- dplyr::slice_sample(without_df, n = without_n, replace = FALSE)
-sampled <- bind_rows(samp_with, samp_without) %>% arrange(stratum, id)
-
-message(sprintf("Sampled %d rows (with_translator=%d, without_translator=%d)", nrow(sampled), nrow(samp_with), nrow(samp_without)))
-
-# Build English prompt directly from dnb_dc snippet
-build_prompt_from_dc <- function(dnb_dc_xml) {
- # Truncate to keep prompts manageable
- snippet <- dnb_dc_xml
- max_chars <- 4000L
- if (!is.na(snippet) && nchar(snippet, type = "chars") > max_chars) {
- snippet <- substr(snippet, 1L, max_chars)
- }
- paste0(
- "You are a bibliographic assistant. From the following OAI-DC metadata of a German National Library record, determine: \n",
- "1) Whether the book is a translation.\n",
- "2) If it is a translation: the translator's full name, the original title, and the original publication year.\n",
- "3) If it is not a translation but seems to be a later edition (e.g., 2nd edition), try to infer the original publication year of the first edition.\n\n",
- "Return a single JSON object only — no prose — with exactly these keys: \n",
- "{\n \"is_translation\": <boolean>,\n \"translator_name\": <string|null>,\n \"original_title\": <string|null>,\n \"original_publication_year\": <integer|null>,\n \"origin_confidence\": <number between 0 and 1>\n}\n",
- "Use null for unknown values. Output JSON only.\n\n",
- "OAI-DC metadata (XML):\n",
- snippet
- )
-}
-
-# Build English prompt from human-readable fields
-build_prompt_from_fields <- function(title = NA_character_, author = NA_character_, subTitle = NA_character_, translator = NA_character_, publisher = NA_character_, pubDate = NA_character_, ISBN = NA_character_) {
- to_chr <- function(x) {
- if (is.null(x) || length(x) == 0) {
- return("")
- }
- y <- as.character(x)
- if (length(y) == 0) {
- return("")
- }
- y[is.na(y)] <- ""
- y
- }
- title <- to_chr(title)
- subTitle <- to_chr(subTitle)
- author <- to_chr(author)
- translator <- to_chr(translator)
- publisher <- to_chr(publisher)
- pubDate <- to_chr(pubDate)
- ISBN <- to_chr(ISBN)
- lines <- c(
- if (nzchar(title)) sprintf("Title: %s", title) else NULL,
- if (nzchar(subTitle)) sprintf("Subtitle: %s", subTitle) else NULL,
- if (nzchar(author)) sprintf("Author: %s", author) else NULL,
- if (nzchar(translator)) sprintf("Translator: %s", translator) else NULL,
- if (nzchar(publisher)) sprintf("Publisher: %s", publisher) else NULL,
- if (nzchar(pubDate)) sprintf("Publication year (German edition): %s", pubDate) else NULL,
- if (nzchar(ISBN)) sprintf("ISBN: %s", ISBN) else NULL
- )
- context <- paste(lines, collapse = "\n")
- paste0(
- "You are a bibliographic assistant. For the following German-language book, determine:\n",
- "1) Whether it is a translation.\n",
- "2) If yes: the original title and the year of first publication.\n\n",
- "Return a single JSON object only — no prose — with exactly these keys:\n",
- "{\n \"is_translation\": <boolean>,\n \"translator_name\": <string|null>,\n \"original_title\": <string|null>,\n \"original_publication_year\": <integer|null>,\n \"origin_confidence\": <number between 0 and 1>\n}\n",
- "Use null for unknown. Output JSON only.\n\n",
- "Metadata:\n",
- context
- )
-}
-
-# Models used for inference (replace Gemini with an Anthropic model)
-models <- c("claude-3-haiku-20240307", "gpt-4o-mini", "deepseek-chat")
-
-# Helper to build an empty one-row result with all model-prefixed columns
-empty_model_row <- function() {
- vals <- list()
- for (m in models) {
- vals[[paste0(m, "_translator_name")]] <- NA_character_
- vals[[paste0(m, "_original_title")]] <- NA_character_
- vals[[paste0(m, "_original_publication_year")]] <- as.integer(NA)
- vals[[paste0(m, "_origin_confidence")]] <- as.numeric(NA)
- }
- tibble::as_tibble(vals)
-}
-
-# Map model prefix to provider
-resolve_provider_for_model <- function(model) {
- if (grepl("^gpt-", model, ignore.case = TRUE)) {
- return(tidyllm::openai())
- } else if (grepl("^claude-", model, ignore.case = TRUE)) {
- return(tidyllm::claude())
- } else if (grepl("^gemini-", model, ignore.case = TRUE)) {
- return(tidyllm::gemini())
- } else if (grepl("^deepseek", model, ignore.case = TRUE)) {
- return(tidyllm::deepseek())
- }
- tidyllm::deepseek()
-}
-
-ensure_api_key_for_model <- function(model) {
- if (grepl("^gpt-", model, ignore.case = TRUE)) {
- if (!nzchar(Sys.getenv("OPENAI_API_KEY"))) stop("OPENAI_API_KEY is not set")
- } else if (grepl("^claude-", model, ignore.case = TRUE)) {
- if (!nzchar(Sys.getenv("ANTHROPIC_API_KEY"))) stop("ANTHROPIC_API_KEY is not set")
- } else if (grepl("^gemini-", model, ignore.case = TRUE)) {
- if (!nzchar(Sys.getenv("GOOGLE_API_KEY"))) stop("GOOGLE_API_KEY is not set")
- } else if (grepl("^deepseek", model, ignore.case = TRUE)) {
- if (!nzchar(Sys.getenv("DEEPSEEK_API_KEY"))) stop("DEEPSEEK_API_KEY is not set")
- }
- invisible(TRUE)
-}
-
-inference_for_prompt <- function(prompt_text, model, temperature = 0.1, timeout = 60, max_tries = 3) {
- ensure_api_key_for_model(model)
- provider <- resolve_provider_for_model(model)
- prompt <- prompt_text
-
- res <- NULL
- err <- NULL
- tryCatch(
- {
- res <- tidyllm::llm_message(prompt) |>
- tidyllm::chat(
- .provider = provider,
- .model = model,
- .temperature = temperature,
- .timeout = timeout,
- .max_tries = max_tries
- )
- },
- error = function(e) err <<- conditionMessage(e)
- )
-
- if (!is.null(err)) {
- message(sprintf(
- "LLM request failed (model=%s): %s",
- model, err
- ))
- return(tibble::tibble(
- translator_name = NA_character_,
- original_title = NA_character_,
- original_publication_year = as.integer(NA),
- origin_confidence = as.numeric(NA)
- ))
- }
-
- # Prefer JSON parsing via tidyllm helper; falls back to NA on failure
- parsed <- NULL
- suppressWarnings({
- parsed <- tidyllm::get_reply_data(res)
- })
-
- if (is.null(parsed)) {
- # Try to extract JSON manually as a very last resort
- raw <- tidyllm::get_reply(res)
- json_txt <- stringr::str_extract(raw, "\\{[\\s\\S]*\\}")
- if (!is.na(json_txt)) {
- parsed <- try(jsonlite::fromJSON(json_txt), silent = TRUE)
- if (inherits(parsed, "try-error")) parsed <- NULL
- }
- if (is.null(parsed) && tolower(Sys.getenv("DELIKO_DEBUG", "")) %in% c("1", "true", "yes", "y")) {
- snippet <- substr(raw, 1L, 500L)
- message(sprintf("Parse failed; raw reply snippet (model=%s): %s", model, gsub("\n", " ", snippet)))
- }
- }
-
- if (is.null(parsed)) {
- return(tibble::tibble(
- translator_name = NA_character_,
- original_title = NA_character_,
- original_publication_year = as.integer(NA),
- origin_confidence = as.numeric(NA)
- ))
- }
-
- # parsed may be a data.frame/tibble or a named list
- get_val <- function(obj, keys) {
- for (k in keys) {
- if (is.data.frame(obj) && k %in% names(obj)) {
- return(obj[[k]][[1]])
- }
- if (is.list(obj) && !is.null(obj[[k]])) {
- return(obj[[k]])
- }
- }
- return(NA)
- }
-
- tr <- get_val(parsed, c("translator_name", "translator"))
- ot <- get_val(parsed, c("original_title", "originalTitle", "title_original"))
- opd <- suppressWarnings(as.integer(get_val(parsed, c("original_publication_year", "original_publication_date", "year", "year_of_first_publication"))))
- cf <- suppressWarnings(as.numeric(get_val(parsed, c("origin_confidence", "confidence", "confidence_score"))))
-
- if (!is.na(cf)) cf <- max(0, min(1, cf))
-
- # Debug which keys were present
- if (tolower(Sys.getenv("DELIKO_DEBUG", "")) %in% c("1", "true", "yes", "y")) {
- present <- intersect(c("translator_name", "translator", "original_title", "originalTitle", "title_original", "original_publication_year", "original_publication_year", "year", "year_of_first_publication", "origin_confidence", "confidence", "confidence_score"), names(parsed))
- message(sprintf("Parsed keys present (model=%s): %s", model, paste(present, collapse = ", ")))
- }
-
- tibble::tibble(
- translator_name = ifelse(is.null(tr) || is.na(tr) || !nzchar(as.character(tr)), NA_character_, as.character(tr)),
- original_title = ifelse(is.null(ot) || is.na(ot) || !nzchar(as.character(ot)), NA_character_, as.character(ot)),
- original_publication_year = ifelse(is.na(opd), as.integer(NA), as.integer(opd)),
- origin_confidence = ifelse(is.na(cf), as.numeric(NA), as.numeric(cf))
- )
-}
-
-# Run LLMs in parallel for each row if possible
-# We use parallelization at the per-row level using future.apply if available; otherwise sequential.
-has_future <- (requireNamespace("future.apply", quietly = TRUE) && requireNamespace("future", quietly = TRUE))
-if (has_future) {
- future::plan(future::multisession, workers = min(length(models), 3L))
- on.exit(
- {
- try(future::plan(future::sequential), silent = TRUE)
- },
- add = TRUE
- )
-}
-
-augment_row_with_models <- function(dnb_dc_xml) {
- # Execute models in parallel if future.apply is available
- run_model <- function(model) {
- err_msg <- NULL
- t0 <- Sys.time()
- res <- tryCatch(
- inference_for_prompt(dnb_dc_xml, model),
- error = function(e) {
- err_msg <<- conditionMessage(e)
- tibble::tibble(
- translator_name = NA_character_,
- original_title = NA_character_,
- original_publication_year = as.integer(NA),
- origin_confidence = as.numeric(NA)
- )
- }
- )
- dt <- as.numeric(difftime(Sys.time(), t0, units = "secs"))
- if (is.null(err_msg)) {
- message(sprintf("LLM timing: model=%s, elapsed=%.2fs", model, dt))
- } else {
- message(sprintf("LLM timing: model=%s, elapsed=%.2fs, error=%s", model, dt, err_msg))
- }
- res
- }
- results <- if (has_future) {
- future.apply::future_lapply(models, run_model)
- } else {
- lapply(models, run_model)
- }
- names(results) <- models
-
- # Prefix columns with model name
- out_list <- lapply(names(results), function(m) {
- res <- results[[m]]
- names(res) <- paste0(m, "_", names(res))
- res
- })
- out <- tryCatch(dplyr::bind_cols(out_list), error = function(e) {
- message(sprintf("bind_cols failed for model outputs: %s", conditionMessage(e)))
- NULL
- })
- if (is.null(out)) out <- empty_model_row()
- out
-}
-
-message(sprintf("Running models: %s; parallel=%s", paste(models, collapse = ", "), has_future))
-getv <- function(df, i, nm) if (nm %in% names(df)) df[[nm]][i] else NA_character_
-if (prompt_source == "text") {
- prompts <- vapply(seq_len(nrow(sampled)), function(i) {
- build_prompt_from_fields(
- title = getv(sampled, i, "title"),
- author = getv(sampled, i, "author"),
- subTitle = getv(sampled, i, "subTitle"),
- translator = getv(sampled, i, "translator"),
- publisher = getv(sampled, i, "publisher"),
- pubDate = getv(sampled, i, "pubDate"),
- ISBN = getv(sampled, i, "ISBN")
- )
- }, character(1))
-} else {
- prompts <- vapply(seq_len(nrow(sampled)), function(i) build_prompt_from_dc(sampled$dnb_dc[i]), character(1))
-}
-sampled$prompt_text <- prompts
-
-if (isTRUE(show_prompts)) {
- message(sprintf("Printing %d prompts:", length(prompts)))
- for (i in seq_along(prompts)) {
- pid <- if ("id" %in% names(sampled)) as.character(sampled$id[i]) else "<no id>"
- str <- if ("stratum" %in% names(sampled)) as.character(sampled$stratum[i]) else "<no stratum>"
- message(sprintf("Prompt [%d/%d] id=%s stratum=%s:\n%s", i, length(prompts), pid, str, prompts[i]))
- }
-}
-
-model_cols <- sampled %>%
- mutate(.row = row_number()) %>%
- split(.$.row) %>%
- purrr::map(~ augment_row_with_models(.x$prompt_text[[1]])) %>%
- dplyr::bind_rows()
-
-message(sprintf("sampled rows=%d, model_cols rows=%d, model_cols cols=%d", nrow(sampled), nrow(model_cols), ncol(model_cols)))
-
-# Ensure model_cols has same number of rows as sampled
-if (nrow(model_cols) != nrow(sampled)) {
- message(sprintf("Adjusting model_cols rows from %d to %d", nrow(model_cols), nrow(sampled)))
- if (nrow(model_cols) < nrow(sampled)) {
- empty_row <- empty_model_row()
- pad_list <- rep(list(empty_row), nrow(sampled) - nrow(model_cols))
- pad <- dplyr::bind_rows(pad_list)
- model_cols <- dplyr::bind_rows(model_cols, pad)
- } else if (nrow(model_cols) > nrow(sampled)) {
- model_cols <- dplyr::slice(model_cols, seq_len(nrow(sampled)))
- }
-}
-
-# Add German title and publication year to the output
-title_de <- if ("title" %in% names(sampled)) as.character(sampled$title) else rep(NA_character_, nrow(sampled))
-pub_year_de <- if ("pubDate" %in% names(sampled)) suppressWarnings(as.integer(stringr::str_extract(as.character(sampled$pubDate), "\\\\d{4}"))) else rep(as.integer(NA), nrow(sampled))
-meta_cols <- tibble::tibble(title_de = title_de, pub_year_de = pub_year_de)
-out <- dplyr::bind_cols(sampled %>% select(id, stratum), meta_cols, model_cols)
-
-# Optional consensus columns (majority vote for title/name string equality; median for year/confidence)
-if (isTRUE(include_consensus)) {
- # Helper to majority vote on strings (ignoring case/punctuation)
- norm <- function(x) {
- x <- tolower(trimws(x))
- x <- gsub("[[:punct:]]+", "", x)
- x[nzchar(x)]
- }
- # For translator names: keep punctuation, only trim and lowercase
- norm_keep_punct <- function(x) {
- x <- tolower(trimws(x))
- x[nzchar(x)]
- }
- get_mode <- function(vals) {
- vals <- vals[!is.na(vals) & nzchar(vals)]
- if (length(vals) == 0) {
- return(NA_character_)
- }
- tbl <- sort(table(vals), decreasing = TRUE)
- names(tbl)[1]
- }
- get_median_int <- function(vals) {
- vals <- vals[!is.na(vals)]
- if (length(vals) == 0) {
- return(as.integer(NA))
- }
- as.integer(median(vals))
- }
- get_mean_num <- function(vals) {
- vals <- vals[!is.na(vals)]
- if (length(vals) == 0) {
- return(as.numeric(NA))
- }
- as.numeric(mean(vals))
- }
-
- out <- out %>%
- rowwise() %>%
- mutate(
- consensus_translator_name = {
- vals <- as.character(unlist(c_across(ends_with("_translator_name"))))
- vals <- norm_keep_punct(vals)
- if (length(vals) == 0) NA_character_ else get_mode(vals)
- },
- consensus_original_title = {
- vals <- as.character(unlist(c_across(ends_with("_original_title"))))
- vals <- norm(vals)
- if (length(vals) == 0) NA_character_ else get_mode(vals)
- },
- consensus_original_publication_year = get_median_int(as.integer(unlist(c_across(ends_with("_original_publication_year"))))),
- consensus_origin_confidence = get_mean_num(as.numeric(unlist(c_across(ends_with("_origin_confidence")))))
- ) %>%
- ungroup()
-}
-
-# Title-case consensus fields
-out <- out %>% mutate(
- consensus_translator_name = ifelse(is.na(consensus_translator_name), NA_character_, stringr::str_to_title(consensus_translator_name)),
- consensus_original_title = ifelse(is.na(consensus_original_title), NA_character_, stringr::str_to_title(consensus_original_title))
-)
-
-readr::write_tsv(out, opts$out)
-message("Wrote ", nrow(out), " rows to ", opts$out)
-
-# Inter-annotator agreement for original_publication_year
-year_cols <- names(out)[endsWith(names(out), "_original_publication_year")]
-if (length(year_cols) >= 2) {
- years_mat <- as.matrix(out[year_cols])
- # Pairwise exact match rate
- pairs <- combn(seq_along(year_cols), 2, simplify = FALSE)
- if (length(pairs) > 0) {
- msg <- vapply(pairs, function(p) {
- a <- years_mat[, p[1]]
- b <- years_mat[, p[2]]
- agree <- mean(a == b, na.rm = TRUE)
- sprintf("%s vs %s: %.2f", year_cols[p[1]], year_cols[p[2]], agree)
- }, character(1))
- message("Inter-annotator exact agreement (year): ")
- for (m in msg) message(" ", m)
- }
- # Fleiss' kappa if irr is available
- if (requireNamespace("irr", quietly = TRUE)) {
- # Convert to factors per row; irr::kappam.fleiss expects raters in columns
- # Handle NAs by treating them as a separate level
- yrs_df <- as.data.frame(years_mat)
- for (j in seq_along(yrs_df)) yrs_df[[j]] <- as.factor(ifelse(is.na(yrs_df[[j]]), "NA", as.character(yrs_df[[j]])))
- suppressWarnings({
- kappa_res <- try(irr::kappam.fleiss(yrs_df), silent = TRUE)
- })
- if (!inherits(kappa_res, "try-error")) {
- message(sprintf("Fleiss' kappa (year): %.3f", kappa_res$value))
- }
- }
-}