Remove unrelated scripts

commit: af020463c1d0471227775b74b63ba8a8fa5ab9b8 [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Sat Oct 04 15:24:44 2025 +0200
committer: Marc Kupietz <kupietz@ids-mannheim.de> Sat Oct 04 15:25:08 2025 +0200
tree: 1f8c12ba2e8c9f1af3e39e7509b68f658069e0c6
parent: 875623e2e5735dab5b3a88336a7c3d2709121f12 [diff]
diff --git a/scripts/duckdb2tibble.R b/scripts/duckdb2tibble.R
deleted file mode 100755
index 490338c..0000000
--- a/scripts/duckdb2tibble.R
+++ /dev/null

@@ -1,85 +0,0 @@
-#!/usr/bin/env Rscript
-
-suppressPackageStartupMessages({
-  library(DBI)
-  library(duckdb)
-  library(tibble)
-})
-
-# Public function: safe to source in RStudio
-# Returns a tibble from DuckDB using either a table name or a SQL query
-duckdb2tibble <- function(
-    db = "/home/kupietz/korap4dnb/epub2i5/data/deliko_metadata.duckdb",
-    table = "deliko_metadata",
-    sql = "",
-    limit = 0,
-    read_only = TRUE) {
-  stopifnot(is.character(db), length(db) == 1)
-  if (!file.exists(db)) stop("DuckDB not found: ", db)
-
-  drv <- duckdb::duckdb()
-  con <- DBI::dbConnect(drv, dbdir = db, read_only = isTRUE(read_only))
-  on.exit(
-    {
-      try(DBI::dbDisconnect(con, shutdown = FALSE), silent = TRUE)
-      try(duckdb::duckdb_shutdown(drv), silent = TRUE)
-    },
-    add = TRUE
-  )
-
-  if (nzchar(sql)) {
-    res <- DBI::dbGetQuery(con, sql)
-    return(tibble::as_tibble(res))
-  }
-
-  q_schema <- DBI::dbQuoteIdentifier(con, "main")
-  q_table <- DBI::dbQuoteIdentifier(con, table)
-  fq_name <- paste0(as.character(q_schema), ".", as.character(q_table))
-  q <- paste0("SELECT * FROM ", fq_name)
-  if (is.numeric(limit) && limit > 0) q <- paste0(q, " LIMIT ", as.integer(limit))
-
-  tibble::as_tibble(DBI::dbGetQuery(con, q))
-}
-
-# CLI mode only: run when invoked via Rscript
-if (!interactive()) {
-  defaults <- list(
-    db = "/home/kupietz/korap4dnb/epub2i5/data/deliko_metadata.duckdb",
-    table = "deliko_metadata",
-    sql = "",
-    limit = "0",
-    output = ""
-  )
-  opts <- defaults
-  args <- commandArgs(trailingOnly = TRUE)
-  for (a in args) {
-    if (!grepl("=", a, fixed = TRUE)) next
-    kv <- strsplit(a, "=", fixed = TRUE)[[1]]
-    if (length(kv) == 2 && kv[1] %in% names(opts)) opts[[kv[1]]] <- kv[2]
-  }
-
-  # ENV overrides
-  opts$db <- Sys.getenv("DELIKO_DUCKDB", opts$db)
-  opts$table <- Sys.getenv("DELIKO_TABLE", opts$table)
-  opts$sql <- Sys.getenv("DELIKO_SQL", opts$sql)
-  opts$limit <- Sys.getenv("LIMIT", opts$limit)
-  opts$output <- Sys.getenv("OUT_RDS", opts$output)
-
-  message("Connecting DuckDB: ", opts$db)
-  df <- duckdb2tibble(
-    db = opts$db,
-    table = opts$table,
-    sql = opts$sql,
-    limit = suppressWarnings(as.integer(opts$limit)),
-    read_only = TRUE
-  )
-
-  message("Loaded tibble: ", nrow(df), " rows x ", ncol(df), " cols")
-  message("Columns: ", paste(names(df), collapse = ", "))
-  print(utils::head(df, n = if (nrow(df) > 10) 10 else nrow(df)))
-
-  if (nzchar(opts$output)) {
-    saveRDS(df, opts$output)
-    message("Saved tibble to ", opts$output)
-  }
-}

diff --git a/scripts/sample_origin_trace_duckdb.R b/scripts/sample_origin_trace_duckdb.R
deleted file mode 100755
index 7223036..0000000
--- a/scripts/sample_origin_trace_duckdb.R
+++ /dev/null

@@ -1,589 +0,0 @@
-#!/usr/bin/env Rscript
-
-suppressPackageStartupMessages({
-  library(DBI)
-  library(duckdb)
-  library(dplyr)
-  library(purrr)
-  library(readr)
-  library(stringr)
-  library(tidyr)
-  library(tidyllm)
-})
-# Defaults (override via CLI key=value or ENV)
-defaults <- list(
-  db = "/home/kupietz/korap4dnb/epub2i5/data/deliko_metadata.duckdb",
-  table = "deliko_metadata",
-  id_col = "ISBN", # stable identifier per record
-  dc_col = "dnb_dc", # column with DC XML
-  sample_size = "10", # total sample size (default)
-  seed = "41", # RNG seed for reproducibility
-  out = "sample_origin_trace.tsv", # output TSV path
-  include_consensus = "true", # whether to compute consensus columns
-  prompt_source = "xml", # text | xml
-  show_prompts = "true" # whether to print prompts to console
-)
-
-# Parse key=value CLI args
-opts <- defaults
-args <- commandArgs(trailingOnly = TRUE)
-for (a in args) {
-  if (!grepl("=", a, fixed = TRUE)) next
-  kv <- strsplit(a, "=", fixed = TRUE)[[1]]
-  if (length(kv) == 2 && kv[1] %in% names(opts)) opts[[kv[1]]] <- kv[2]
-}
-
-# ENV overrides
-opts$db <- Sys.getenv("DELIKO_DUCKDB", opts$db)
-opts$table <- Sys.getenv("DELIKO_TABLE", opts$table)
-opts$id_col <- Sys.getenv("DELIKO_ID_COL", opts$id_col)
-opts$dc_col <- Sys.getenv("DELIKO_DC_COL", opts$dc_col)
-opts$sample_size <- Sys.getenv("DELIKO_SAMPLE_SIZE", opts$sample_size)
-opts$seed <- Sys.getenv("DELIKO_SAMPLE_SEED", opts$seed)
-opts$out <- Sys.getenv("DELIKO_SAMPLE_OUT", opts$out)
-opts$include_consensus <- Sys.getenv("DELIKO_INCLUDE_CONSENSUS", opts$include_consensus)
-opts$prompt_source <- Sys.getenv("DELIKO_PROMPT_SOURCE", opts$prompt_source)
-opts$show_prompts <- Sys.getenv("DELIKO_SHOW_PROMPTS", opts$show_prompts)
-
-# Coerce and validate
-sample_size <- suppressWarnings(as.integer(opts$sample_size))
-if (is.na(sample_size) || sample_size <= 0) stop("Invalid sample_size: ", opts$sample_size)
-seed <- suppressWarnings(as.integer(opts$seed))
-if (is.na(seed)) seed <- 42L
-include_consensus <- tolower(opts$include_consensus) %in% c("true", "1", "yes", "y")
-prompt_source <- tolower(opts$prompt_source)
-if (!(prompt_source %in% c("text", "xml"))) prompt_source <- "text"
-message("Prompt source: ", prompt_source)
-show_prompts <- tolower(opts$show_prompts) %in% c("true", "1", "yes", "y")
-
-if (!file.exists(opts$db)) stop("DuckDB not found: ", opts$db)
-
-# Use helper to access DuckDB robustly without managing connection lifecycle here
-duck_env <- new.env(parent = baseenv())
-sys.source("/home/kupietz/korap4dnb/epub2i5/scripts/duckdb2tibble.R", envir = duck_env)
-duckdb2tibble <- get("duckdb2tibble", envir = duck_env)
-
-# Resolve effective column names by reading zero rows
-message("Resolving columns from table: ", opts$table)
-df0 <- duckdb2tibble(
-  db = opts$db,
-  table = opts$table,
-  limit = 0
-)
-if (!is.data.frame(df0)) stop("Failed to read schema from table: ", opts$table)
-cols <- names(df0)
-message(sprintf("Loaded schema columns: %s", paste(cols, collapse = ", ")))
-# safe match that returns NA_character_ if not found
-safe_match_name <- function(name, pool) {
-  idx <- match(tolower(name), tolower(pool))
-  if (is.na(idx)) {
-    return(NA_character_)
-  }
-  pool[[idx]]
-}
-id_col_eff <- safe_match_name(opts$id_col, cols)
-dc_col_eff <- safe_match_name(opts$dc_col, cols)
-if (is.na(id_col_eff)) stop("ID column not found: ", opts$id_col)
-if (is.na(dc_col_eff)) stop("DC column not found: ", opts$dc_col)
-title_eff <- cols[match(tolower("title"), tolower(cols))]
-author_eff <- cols[match(tolower("author"), tolower(cols))]
-subTitle_eff <- cols[match(tolower("subTitle"), tolower(cols))]
-translator_eff <- cols[match(tolower("translator"), tolower(cols))]
-publisher_eff <- cols[match(tolower("publisher"), tolower(cols))]
-pubDate_eff <- cols[match(tolower("pubDate"), tolower(cols))]
-isbn_eff <- cols[match(tolower("ISBN"), tolower(cols))]
-
-# Build SQL with explicit quoting to handle case-sensitive identifiers
-sel <- c(
-  sprintf('"%s" AS id', id_col_eff),
-  sprintf('"%s" AS dnb_dc', dc_col_eff)
-)
-if (!is.na(title_eff)) sel <- c(sel, sprintf('"%s" AS title', title_eff))
-if (!is.na(pubDate_eff)) sel <- c(sel, sprintf('"%s" AS pubDate', pubDate_eff))
-if (prompt_source == "text") {
-  if (!is.na(author_eff)) sel <- c(sel, sprintf('"%s" AS author', author_eff))
-  if (!is.na(subTitle_eff)) sel <- c(sel, sprintf('"%s" AS subTitle', subTitle_eff))
-  if (!is.na(translator_eff)) sel <- c(sel, sprintf('"%s" AS translator', translator_eff))
-  if (!is.na(publisher_eff)) sel <- c(sel, sprintf('"%s" AS publisher', publisher_eff))
-  if (!is.na(isbn_eff)) sel <- c(sel, sprintf('"%s" AS ISBN', isbn_eff))
-}
-sql <- sprintf(
-  'SELECT %s FROM "main"."%s" WHERE "%s" IS NOT NULL AND length(trim("%s")) > 0',
-  paste(sel, collapse = ", "), opts$table, dc_col_eff, dc_col_eff
-)
-
-# Fetch minimal data needed for stratification
-df <- duckdb2tibble(db = opts$db, sql = sql)
-if (nrow(df) == 0) stop("No rows with non-empty ", dc_col_eff)
-
-# Stratify by presence of the German term "Übersetzer" in the dnb_dc payload
-df <- df %>% mutate(
-  stratum = if_else(str_detect(dnb_dc, fixed("Übersetzer", ignore_case = TRUE)), "with_translator", "without_translator")
-)
-
-# Compute per-stratum sizes (balanced rounding) under total sample_size
-# Helper for infix default used in size allocation
-`%||%` <- function(x, y) if (length(x) == 0 || is.na(x)) y else x
-set.seed(seed)
-tab <- df %>% count(stratum, name = "N")
-if (nrow(tab) == 1) {
-  n_with <- if (tab$stratum[1] == "with_translator") sample_size else 0L
-  n_without <- if (tab$stratum[1] == "without_translator") sample_size else 0L
-} else {
-  p_with <- tab$N[tab$stratum == "with_translator"] / nrow(df)
-  if (is.na(p_with)) p_with <- 0
-  n_with <- as.integer(round(sample_size * p_with))
-  n_with <- min(n_with, tab$N[tab$stratum == "with_translator"] %||% 0L)
-  n_without <- sample_size - n_with
-  max_without <- tab$N[tab$stratum == "without_translator"] %||% 0L
-  if (n_without > max_without) {
-    n_without <- max_without
-    # move leftover to with_translator if available
-    rem <- sample_size - (n_with + n_without)
-    max_with <- tab$N[tab$stratum == "with_translator"] %||% 0L
-    n_with <- min(n_with + max(rem, 0L), max_with)
-  }
-}
-
-# Compute subgroup data frames and sample sizes as constants
-with_df <- df %>% filter(stratum == "with_translator")
-without_df <- df %>% filter(stratum == "without_translator")
-want_with <- floor(sample_size / 2)
-want_without <- sample_size - want_with
-avail_with <- nrow(with_df)
-avail_without <- nrow(without_df)
-take_with <- min(want_with, avail_with)
-take_without <- min(want_without, avail_without)
-rem <- sample_size - (take_with + take_without)
-if (rem > 0) {
-  left_with <- max(avail_with - take_with, 0L)
-  add_from_with <- min(rem, left_with)
-  take_with <- take_with + add_from_with
-  rem <- rem - add_from_with
-  if (rem > 0) {
-    left_without <- max(avail_without - take_without, 0L)
-    add_from_without <- min(rem, left_without)
-    take_without <- take_without + add_from_without
-    rem <- rem - add_from_without
-  }
-}
-with_n <- take_with
-without_n <- take_without
-message(sprintf("Stratified target with=%d without=%d; available with=%d without=%d; chosen with=%d without=%d", want_with, want_without, avail_with, avail_without, with_n, without_n))
-
-samp_with <- dplyr::slice_sample(with_df, n = with_n, replace = FALSE)
-samp_without <- dplyr::slice_sample(without_df, n = without_n, replace = FALSE)
-sampled <- bind_rows(samp_with, samp_without) %>% arrange(stratum, id)
-
-message(sprintf("Sampled %d rows (with_translator=%d, without_translator=%d)", nrow(sampled), nrow(samp_with), nrow(samp_without)))
-
-# Build English prompt directly from dnb_dc snippet
-build_prompt_from_dc <- function(dnb_dc_xml) {
-  # Truncate to keep prompts manageable
-  snippet <- dnb_dc_xml
-  max_chars <- 4000L
-  if (!is.na(snippet) && nchar(snippet, type = "chars") > max_chars) {
-    snippet <- substr(snippet, 1L, max_chars)
-  }
-  paste0(
-    "You are a bibliographic assistant. From the following OAI-DC metadata of a German National Library record, determine: \n",
-    "1) Whether the book is a translation.\n",
-    "2) If it is a translation: the translator's full name, the original title, and the original publication year.\n",
-    "3) If it is not a translation but seems to be a later edition (e.g., 2nd edition), try to infer the original publication year of the first edition.\n\n",
-    "Return a single JSON object only — no prose — with exactly these keys: \n",
-    "{\n  \"is_translation\": <boolean>,\n  \"translator_name\": <string|null>,\n  \"original_title\": <string|null>,\n  \"original_publication_year\": <integer|null>,\n  \"origin_confidence\": <number between 0 and 1>\n}\n",
-    "Use null for unknown values. Output JSON only.\n\n",
-    "OAI-DC metadata (XML):\n",
-    snippet
-  )
-}
-
-# Build English prompt from human-readable fields
-build_prompt_from_fields <- function(title = NA_character_, author = NA_character_, subTitle = NA_character_, translator = NA_character_, publisher = NA_character_, pubDate = NA_character_, ISBN = NA_character_) {
-  to_chr <- function(x) {
-    if (is.null(x) || length(x) == 0) {
-      return("")
-    }
-    y <- as.character(x)
-    if (length(y) == 0) {
-      return("")
-    }
-    y[is.na(y)] <- ""
-    y
-  }
-  title <- to_chr(title)
-  subTitle <- to_chr(subTitle)
-  author <- to_chr(author)
-  translator <- to_chr(translator)
-  publisher <- to_chr(publisher)
-  pubDate <- to_chr(pubDate)
-  ISBN <- to_chr(ISBN)
-  lines <- c(
-    if (nzchar(title)) sprintf("Title: %s", title) else NULL,
-    if (nzchar(subTitle)) sprintf("Subtitle: %s", subTitle) else NULL,
-    if (nzchar(author)) sprintf("Author: %s", author) else NULL,
-    if (nzchar(translator)) sprintf("Translator: %s", translator) else NULL,
-    if (nzchar(publisher)) sprintf("Publisher: %s", publisher) else NULL,
-    if (nzchar(pubDate)) sprintf("Publication year (German edition): %s", pubDate) else NULL,
-    if (nzchar(ISBN)) sprintf("ISBN: %s", ISBN) else NULL
-  )
-  context <- paste(lines, collapse = "\n")
-  paste0(
-    "You are a bibliographic assistant. For the following German-language book, determine:\n",
-    "1) Whether it is a translation.\n",
-    "2) If yes: the original title and the year of first publication.\n\n",
-    "Return a single JSON object only — no prose — with exactly these keys:\n",
-    "{\n  \"is_translation\": <boolean>,\n  \"translator_name\": <string|null>,\n  \"original_title\": <string|null>,\n  \"original_publication_year\": <integer|null>,\n  \"origin_confidence\": <number between 0 and 1>\n}\n",
-    "Use null for unknown. Output JSON only.\n\n",
-    "Metadata:\n",
-    context
-  )
-}
-
-# Models used for inference (replace Gemini with an Anthropic model)
-models <- c("claude-3-haiku-20240307", "gpt-4o-mini", "deepseek-chat")
-
-# Helper to build an empty one-row result with all model-prefixed columns
-empty_model_row <- function() {
-  vals <- list()
-  for (m in models) {
-    vals[[paste0(m, "_translator_name")]] <- NA_character_
-    vals[[paste0(m, "_original_title")]] <- NA_character_
-    vals[[paste0(m, "_original_publication_year")]] <- as.integer(NA)
-    vals[[paste0(m, "_origin_confidence")]] <- as.numeric(NA)
-  }
-  tibble::as_tibble(vals)
-}
-
-# Map model prefix to provider
-resolve_provider_for_model <- function(model) {
-  if (grepl("^gpt-", model, ignore.case = TRUE)) {
-    return(tidyllm::openai())
-  } else if (grepl("^claude-", model, ignore.case = TRUE)) {
-    return(tidyllm::claude())
-  } else if (grepl("^gemini-", model, ignore.case = TRUE)) {
-    return(tidyllm::gemini())
-  } else if (grepl("^deepseek", model, ignore.case = TRUE)) {
-    return(tidyllm::deepseek())
-  }
-  tidyllm::deepseek()
-}
-
-ensure_api_key_for_model <- function(model) {
-  if (grepl("^gpt-", model, ignore.case = TRUE)) {
-    if (!nzchar(Sys.getenv("OPENAI_API_KEY"))) stop("OPENAI_API_KEY is not set")
-  } else if (grepl("^claude-", model, ignore.case = TRUE)) {
-    if (!nzchar(Sys.getenv("ANTHROPIC_API_KEY"))) stop("ANTHROPIC_API_KEY is not set")
-  } else if (grepl("^gemini-", model, ignore.case = TRUE)) {
-    if (!nzchar(Sys.getenv("GOOGLE_API_KEY"))) stop("GOOGLE_API_KEY is not set")
-  } else if (grepl("^deepseek", model, ignore.case = TRUE)) {
-    if (!nzchar(Sys.getenv("DEEPSEEK_API_KEY"))) stop("DEEPSEEK_API_KEY is not set")
-  }
-  invisible(TRUE)
-}
-
-inference_for_prompt <- function(prompt_text, model, temperature = 0.1, timeout = 60, max_tries = 3) {
-  ensure_api_key_for_model(model)
-  provider <- resolve_provider_for_model(model)
-  prompt <- prompt_text
-
-  res <- NULL
-  err <- NULL
-  tryCatch(
-    {
-      res <- tidyllm::llm_message(prompt) |>
-        tidyllm::chat(
-          .provider = provider,
-          .model = model,
-          .temperature = temperature,
-          .timeout = timeout,
-          .max_tries = max_tries
-        )
-    },
-    error = function(e) err <<- conditionMessage(e)
-  )
-
-  if (!is.null(err)) {
-    message(sprintf(
-      "LLM request failed (model=%s): %s",
-      model, err
-    ))
-    return(tibble::tibble(
-      translator_name = NA_character_,
-      original_title = NA_character_,
-      original_publication_year = as.integer(NA),
-      origin_confidence = as.numeric(NA)
-    ))
-  }
-
-  # Prefer JSON parsing via tidyllm helper; falls back to NA on failure
-  parsed <- NULL
-  suppressWarnings({
-    parsed <- tidyllm::get_reply_data(res)
-  })
-
-  if (is.null(parsed)) {
-    # Try to extract JSON manually as a very last resort
-    raw <- tidyllm::get_reply(res)
-    json_txt <- stringr::str_extract(raw, "\\{[\\s\\S]*\\}")
-    if (!is.na(json_txt)) {
-      parsed <- try(jsonlite::fromJSON(json_txt), silent = TRUE)
-      if (inherits(parsed, "try-error")) parsed <- NULL
-    }
-    if (is.null(parsed) && tolower(Sys.getenv("DELIKO_DEBUG", "")) %in% c("1", "true", "yes", "y")) {
-      snippet <- substr(raw, 1L, 500L)
-      message(sprintf("Parse failed; raw reply snippet (model=%s): %s", model, gsub("\n", " ", snippet)))
-    }
-  }
-
-  if (is.null(parsed)) {
-    return(tibble::tibble(
-      translator_name = NA_character_,
-      original_title = NA_character_,
-      original_publication_year = as.integer(NA),
-      origin_confidence = as.numeric(NA)
-    ))
-  }
-
-  # parsed may be a data.frame/tibble or a named list
-  get_val <- function(obj, keys) {
-    for (k in keys) {
-      if (is.data.frame(obj) && k %in% names(obj)) {
-        return(obj[[k]][[1]])
-      }
-      if (is.list(obj) && !is.null(obj[[k]])) {
-        return(obj[[k]])
-      }
-    }
-    return(NA)
-  }
-
-  tr <- get_val(parsed, c("translator_name", "translator"))
-  ot <- get_val(parsed, c("original_title", "originalTitle", "title_original"))
-  opd <- suppressWarnings(as.integer(get_val(parsed, c("original_publication_year", "original_publication_date", "year", "year_of_first_publication"))))
-  cf <- suppressWarnings(as.numeric(get_val(parsed, c("origin_confidence", "confidence", "confidence_score"))))
-
-  if (!is.na(cf)) cf <- max(0, min(1, cf))
-
-  # Debug which keys were present
-  if (tolower(Sys.getenv("DELIKO_DEBUG", "")) %in% c("1", "true", "yes", "y")) {
-    present <- intersect(c("translator_name", "translator", "original_title", "originalTitle", "title_original", "original_publication_year", "original_publication_year", "year", "year_of_first_publication", "origin_confidence", "confidence", "confidence_score"), names(parsed))
-    message(sprintf("Parsed keys present (model=%s): %s", model, paste(present, collapse = ", ")))
-  }
-
-  tibble::tibble(
-    translator_name = ifelse(is.null(tr) || is.na(tr) || !nzchar(as.character(tr)), NA_character_, as.character(tr)),
-    original_title = ifelse(is.null(ot) || is.na(ot) || !nzchar(as.character(ot)), NA_character_, as.character(ot)),
-    original_publication_year = ifelse(is.na(opd), as.integer(NA), as.integer(opd)),
-    origin_confidence = ifelse(is.na(cf), as.numeric(NA), as.numeric(cf))
-  )
-}
-
-# Run LLMs in parallel for each row if possible
-# We use parallelization at the per-row level using future.apply if available; otherwise sequential.
-has_future <- (requireNamespace("future.apply", quietly = TRUE) && requireNamespace("future", quietly = TRUE))
-if (has_future) {
-  future::plan(future::multisession, workers = min(length(models), 3L))
-  on.exit(
-    {
-      try(future::plan(future::sequential), silent = TRUE)
-    },
-    add = TRUE
-  )
-}
-
-augment_row_with_models <- function(dnb_dc_xml) {
-  # Execute models in parallel if future.apply is available
-  run_model <- function(model) {
-    err_msg <- NULL
-    t0 <- Sys.time()
-    res <- tryCatch(
-      inference_for_prompt(dnb_dc_xml, model),
-      error = function(e) {
-        err_msg <<- conditionMessage(e)
-        tibble::tibble(
-          translator_name = NA_character_,
-          original_title = NA_character_,
-          original_publication_year = as.integer(NA),
-          origin_confidence = as.numeric(NA)
-        )
-      }
-    )
-    dt <- as.numeric(difftime(Sys.time(), t0, units = "secs"))
-    if (is.null(err_msg)) {
-      message(sprintf("LLM timing: model=%s, elapsed=%.2fs", model, dt))
-    } else {
-      message(sprintf("LLM timing: model=%s, elapsed=%.2fs, error=%s", model, dt, err_msg))
-    }
-    res
-  }
-  results <- if (has_future) {
-    future.apply::future_lapply(models, run_model)
-  } else {
-    lapply(models, run_model)
-  }
-  names(results) <- models
-
-  # Prefix columns with model name
-  out_list <- lapply(names(results), function(m) {
-    res <- results[[m]]
-    names(res) <- paste0(m, "_", names(res))
-    res
-  })
-  out <- tryCatch(dplyr::bind_cols(out_list), error = function(e) {
-    message(sprintf("bind_cols failed for model outputs: %s", conditionMessage(e)))
-    NULL
-  })
-  if (is.null(out)) out <- empty_model_row()
-  out
-}
-
-message(sprintf("Running models: %s; parallel=%s", paste(models, collapse = ", "), has_future))
-getv <- function(df, i, nm) if (nm %in% names(df)) df[[nm]][i] else NA_character_
-if (prompt_source == "text") {
-  prompts <- vapply(seq_len(nrow(sampled)), function(i) {
-    build_prompt_from_fields(
-      title = getv(sampled, i, "title"),
-      author = getv(sampled, i, "author"),
-      subTitle = getv(sampled, i, "subTitle"),
-      translator = getv(sampled, i, "translator"),
-      publisher = getv(sampled, i, "publisher"),
-      pubDate = getv(sampled, i, "pubDate"),
-      ISBN = getv(sampled, i, "ISBN")
-    )
-  }, character(1))
-} else {
-  prompts <- vapply(seq_len(nrow(sampled)), function(i) build_prompt_from_dc(sampled$dnb_dc[i]), character(1))
-}
-sampled$prompt_text <- prompts
-
-if (isTRUE(show_prompts)) {
-  message(sprintf("Printing %d prompts:", length(prompts)))
-  for (i in seq_along(prompts)) {
-    pid <- if ("id" %in% names(sampled)) as.character(sampled$id[i]) else "<no id>"
-    str <- if ("stratum" %in% names(sampled)) as.character(sampled$stratum[i]) else "<no stratum>"
-    message(sprintf("Prompt [%d/%d] id=%s stratum=%s:\n%s", i, length(prompts), pid, str, prompts[i]))
-  }
-}
-
-model_cols <- sampled %>%
-  mutate(.row = row_number()) %>%
-  split(.$.row) %>%
-  purrr::map(~ augment_row_with_models(.x$prompt_text[[1]])) %>%
-  dplyr::bind_rows()
-
-message(sprintf("sampled rows=%d, model_cols rows=%d, model_cols cols=%d", nrow(sampled), nrow(model_cols), ncol(model_cols)))
-
-# Ensure model_cols has same number of rows as sampled
-if (nrow(model_cols) != nrow(sampled)) {
-  message(sprintf("Adjusting model_cols rows from %d to %d", nrow(model_cols), nrow(sampled)))
-  if (nrow(model_cols) < nrow(sampled)) {
-    empty_row <- empty_model_row()
-    pad_list <- rep(list(empty_row), nrow(sampled) - nrow(model_cols))
-    pad <- dplyr::bind_rows(pad_list)
-    model_cols <- dplyr::bind_rows(model_cols, pad)
-  } else if (nrow(model_cols) > nrow(sampled)) {
-    model_cols <- dplyr::slice(model_cols, seq_len(nrow(sampled)))
-  }
-}
-
-# Add German title and publication year to the output
-title_de <- if ("title" %in% names(sampled)) as.character(sampled$title) else rep(NA_character_, nrow(sampled))
-pub_year_de <- if ("pubDate" %in% names(sampled)) suppressWarnings(as.integer(stringr::str_extract(as.character(sampled$pubDate), "\\\\d{4}"))) else rep(as.integer(NA), nrow(sampled))
-meta_cols <- tibble::tibble(title_de = title_de, pub_year_de = pub_year_de)
-out <- dplyr::bind_cols(sampled %>% select(id, stratum), meta_cols, model_cols)
-
-# Optional consensus columns (majority vote for title/name string equality; median for year/confidence)
-if (isTRUE(include_consensus)) {
-  # Helper to majority vote on strings (ignoring case/punctuation)
-  norm <- function(x) {
-    x <- tolower(trimws(x))
-    x <- gsub("[[:punct:]]+", "", x)
-    x[nzchar(x)]
-  }
-  # For translator names: keep punctuation, only trim and lowercase
-  norm_keep_punct <- function(x) {
-    x <- tolower(trimws(x))
-    x[nzchar(x)]
-  }
-  get_mode <- function(vals) {
-    vals <- vals[!is.na(vals) & nzchar(vals)]
-    if (length(vals) == 0) {
-      return(NA_character_)
-    }
-    tbl <- sort(table(vals), decreasing = TRUE)
-    names(tbl)[1]
-  }
-  get_median_int <- function(vals) {
-    vals <- vals[!is.na(vals)]
-    if (length(vals) == 0) {
-      return(as.integer(NA))
-    }
-    as.integer(median(vals))
-  }
-  get_mean_num <- function(vals) {
-    vals <- vals[!is.na(vals)]
-    if (length(vals) == 0) {
-      return(as.numeric(NA))
-    }
-    as.numeric(mean(vals))
-  }
-
-  out <- out %>%
-    rowwise() %>%
-    mutate(
-      consensus_translator_name = {
-        vals <- as.character(unlist(c_across(ends_with("_translator_name"))))
-        vals <- norm_keep_punct(vals)
-        if (length(vals) == 0) NA_character_ else get_mode(vals)
-      },
-      consensus_original_title = {
-        vals <- as.character(unlist(c_across(ends_with("_original_title"))))
-        vals <- norm(vals)
-        if (length(vals) == 0) NA_character_ else get_mode(vals)
-      },
-      consensus_original_publication_year = get_median_int(as.integer(unlist(c_across(ends_with("_original_publication_year"))))),
-      consensus_origin_confidence = get_mean_num(as.numeric(unlist(c_across(ends_with("_origin_confidence")))))
-    ) %>%
-    ungroup()
-}
-
-# Title-case consensus fields
-out <- out %>% mutate(
-  consensus_translator_name = ifelse(is.na(consensus_translator_name), NA_character_, stringr::str_to_title(consensus_translator_name)),
-  consensus_original_title = ifelse(is.na(consensus_original_title), NA_character_, stringr::str_to_title(consensus_original_title))
-)
-
-readr::write_tsv(out, opts$out)
-message("Wrote ", nrow(out), " rows to ", opts$out)
-
-# Inter-annotator agreement for original_publication_year
-year_cols <- names(out)[endsWith(names(out), "_original_publication_year")]
-if (length(year_cols) >= 2) {
-  years_mat <- as.matrix(out[year_cols])
-  # Pairwise exact match rate
-  pairs <- combn(seq_along(year_cols), 2, simplify = FALSE)
-  if (length(pairs) > 0) {
-    msg <- vapply(pairs, function(p) {
-      a <- years_mat[, p[1]]
-      b <- years_mat[, p[2]]
-      agree <- mean(a == b, na.rm = TRUE)
-      sprintf("%s vs %s: %.2f", year_cols[p[1]], year_cols[p[2]], agree)
-    }, character(1))
-    message("Inter-annotator exact agreement (year): ")
-    for (m in msg) message("  ", m)
-  }
-  # Fleiss' kappa if irr is available
-  if (requireNamespace("irr", quietly = TRUE)) {
-    # Convert to factors per row; irr::kappam.fleiss expects raters in columns
-    # Handle NAs by treating them as a separate level
-    yrs_df <- as.data.frame(years_mat)
-    for (j in seq_along(yrs_df)) yrs_df[[j]] <- as.factor(ifelse(is.na(yrs_df[[j]]), "NA", as.character(yrs_df[[j]])))
-    suppressWarnings({
-      kappa_res <- try(irr::kappam.fleiss(yrs_df), silent = TRUE)
-    })
-    if (!inherits(kappa_res, "try-error")) {
-      message(sprintf("Fleiss' kappa (year): %.3f", kappa_res$value))
-    }
-  }
-}
commit	af020463c1d0471227775b74b63ba8a8fa5ab9b8	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Sat Oct 04 15:24:44 2025 +0200
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Sat Oct 04 15:25:08 2025 +0200
tree	1f8c12ba2e8c9f1af3e39e7509b68f658069e0c6
parent	875623e2e5735dab5b3a88336a7c3d2709121f12 [diff]