Add script to add original publication metadata

commit: 1e0a5d087343139157f10df07059ad176bc33174 [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Sun Sep 28 13:43:52 2025 +0200
committer: Marc Kupietz <kupietz@ids-mannheim.de> Sun Sep 28 20:40:05 2025 +0200
tree: 11df18cce2d7189a6841f4ca6fb0b4c2ac5ac641
parent: 1bfd8304d6da39511bb73cbd57b9d1e04ae2a5cf [diff]
diff --git a/scripts/sample_origin_trace_duckdb.R b/scripts/sample_origin_trace_duckdb.R
new file mode 100755
index 0000000..29d588a
--- /dev/null
+++ b/scripts/sample_origin_trace_duckdb.R

@@ -0,0 +1,572 @@
+#!/usr/bin/env Rscript
+
+suppressPackageStartupMessages({
+  library(DBI)
+  library(duckdb)
+  library(dplyr)
+  library(purrr)
+  library(readr)
+  library(stringr)
+  library(tidyr)
+  library(tidyllm)
+})
+# Defaults (override via CLI key=value or ENV)
+defaults <- list(
+  db = "/home/kupietz/korap4dnb/epub2i5/data/deliko_metadata.duckdb",
+  table = "deliko_metadata",
+  id_col = "ISBN", # stable identifier per record
+  dc_col = "dnb_dc", # column with DC XML
+  sample_size = "10", # total sample size (default)
+  seed = "42", # RNG seed for reproducibility
+  out = "sample_origin_trace.tsv", # output TSV path
+  include_consensus = "true", # whether to compute consensus columns
+  prompt_source = "text" # text | xml
+)
+
+# Parse key=value CLI args
+opts <- defaults
+args <- commandArgs(trailingOnly = TRUE)
+for (a in args) {
+  if (!grepl("=", a, fixed = TRUE)) next
+  kv <- strsplit(a, "=", fixed = TRUE)[[1]]
+  if (length(kv) == 2 && kv[1] %in% names(opts)) opts[[kv[1]]] <- kv[2]
+}
+
+# ENV overrides
+opts$db <- Sys.getenv("DELIKO_DUCKDB", opts$db)
+opts$table <- Sys.getenv("DELIKO_TABLE", opts$table)
+opts$id_col <- Sys.getenv("DELIKO_ID_COL", opts$id_col)
+opts$dc_col <- Sys.getenv("DELIKO_DC_COL", opts$dc_col)
+opts$sample_size <- Sys.getenv("DELIKO_SAMPLE_SIZE", opts$sample_size)
+opts$seed <- Sys.getenv("DELIKO_SAMPLE_SEED", opts$seed)
+opts$out <- Sys.getenv("DELIKO_SAMPLE_OUT", opts$out)
+opts$include_consensus <- Sys.getenv("DELIKO_INCLUDE_CONSENSUS", opts$include_consensus)
+opts$prompt_source <- Sys.getenv("DELIKO_PROMPT_SOURCE", opts$prompt_source)
+
+# Coerce and validate
+sample_size <- suppressWarnings(as.integer(opts$sample_size))
+if (is.na(sample_size) || sample_size <= 0) stop("Invalid sample_size: ", opts$sample_size)
+seed <- suppressWarnings(as.integer(opts$seed))
+if (is.na(seed)) seed <- 42L
+include_consensus <- tolower(opts$include_consensus) %in% c("true", "1", "yes", "y")
+prompt_source <- tolower(opts$prompt_source)
+if (!(prompt_source %in% c("text", "xml"))) prompt_source <- "text"
+message("Prompt source: ", prompt_source)
+
+if (!file.exists(opts$db)) stop("DuckDB not found: ", opts$db)
+
+# Use helper to access DuckDB robustly without managing connection lifecycle here
+duck_env <- new.env(parent = baseenv())
+sys.source("/home/kupietz/korap4dnb/epub2i5/scripts/duckdb2tibble.R", envir = duck_env)
+duckdb2tibble <- get("duckdb2tibble", envir = duck_env)
+
+# Resolve effective column names by reading zero rows
+message("Resolving columns from table: ", opts$table)
+df0 <- duckdb2tibble(
+  db = opts$db,
+  table = opts$table,
+  limit = 0
+)
+if (!is.data.frame(df0)) stop("Failed to read schema from table: ", opts$table)
+cols <- names(df0)
+message(sprintf("Loaded schema columns: %s", paste(cols, collapse = ", ")))
+# safe match that returns NA_character_ if not found
+safe_match_name <- function(name, pool) {
+  idx <- match(tolower(name), tolower(pool))
+  if (is.na(idx)) {
+    return(NA_character_)
+  }
+  pool[[idx]]
+}
+id_col_eff <- safe_match_name(opts$id_col, cols)
+dc_col_eff <- safe_match_name(opts$dc_col, cols)
+if (is.na(id_col_eff)) stop("ID column not found: ", opts$id_col)
+if (is.na(dc_col_eff)) stop("DC column not found: ", opts$dc_col)
+title_eff <- cols[match(tolower("title"), tolower(cols))]
+author_eff <- cols[match(tolower("author"), tolower(cols))]
+subTitle_eff <- cols[match(tolower("subTitle"), tolower(cols))]
+translator_eff <- cols[match(tolower("translator"), tolower(cols))]
+publisher_eff <- cols[match(tolower("publisher"), tolower(cols))]
+pubDate_eff <- cols[match(tolower("pubDate"), tolower(cols))]
+isbn_eff <- cols[match(tolower("ISBN"), tolower(cols))]
+
+# Build SQL with explicit quoting to handle case-sensitive identifiers
+sel <- c(
+  sprintf('"%s" AS id', id_col_eff),
+  sprintf('"%s" AS dnb_dc', dc_col_eff)
+)
+if (!is.na(title_eff)) sel <- c(sel, sprintf('"%s" AS title', title_eff))
+if (!is.na(pubDate_eff)) sel <- c(sel, sprintf('"%s" AS pubDate', pubDate_eff))
+if (prompt_source == "text") {
+  if (!is.na(author_eff)) sel <- c(sel, sprintf('"%s" AS author', author_eff))
+  if (!is.na(subTitle_eff)) sel <- c(sel, sprintf('"%s" AS subTitle', subTitle_eff))
+  if (!is.na(translator_eff)) sel <- c(sel, sprintf('"%s" AS translator', translator_eff))
+  if (!is.na(publisher_eff)) sel <- c(sel, sprintf('"%s" AS publisher', publisher_eff))
+  if (!is.na(isbn_eff)) sel <- c(sel, sprintf('"%s" AS ISBN', isbn_eff))
+}
+sql <- sprintf(
+  'SELECT %s FROM "main"."%s" WHERE "%s" IS NOT NULL AND length(trim("%s")) > 0',
+  paste(sel, collapse = ", "), opts$table, dc_col_eff, dc_col_eff
+)
+
+# Fetch minimal data needed for stratification
+df <- duckdb2tibble(db = opts$db, sql = sql)
+if (nrow(df) == 0) stop("No rows with non-empty ", dc_col_eff)
+
+# Stratify by presence of the German term "Übersetzer" in the dnb_dc payload
+df <- df %>% mutate(
+  stratum = if_else(str_detect(dnb_dc, fixed("Übersetzer", ignore_case = TRUE)), "with_translator", "without_translator")
+)
+
+# Compute per-stratum sizes (balanced rounding) under total sample_size
+# Helper for infix default used in size allocation
+`%||%` <- function(x, y) if (length(x) == 0 || is.na(x)) y else x
+set.seed(seed)
+tab <- df %>% count(stratum, name = "N")
+if (nrow(tab) == 1) {
+  n_with <- if (tab$stratum[1] == "with_translator") sample_size else 0L
+  n_without <- if (tab$stratum[1] == "without_translator") sample_size else 0L
+} else {
+  p_with <- tab$N[tab$stratum == "with_translator"] / nrow(df)
+  if (is.na(p_with)) p_with <- 0
+  n_with <- as.integer(round(sample_size * p_with))
+  n_with <- min(n_with, tab$N[tab$stratum == "with_translator"] %||% 0L)
+  n_without <- sample_size - n_with
+  max_without <- tab$N[tab$stratum == "without_translator"] %||% 0L
+  if (n_without > max_without) {
+    n_without <- max_without
+    # move leftover to with_translator if available
+    rem <- sample_size - (n_with + n_without)
+    max_with <- tab$N[tab$stratum == "with_translator"] %||% 0L
+    n_with <- min(n_with + max(rem, 0L), max_with)
+  }
+}
+
+# Compute subgroup data frames and sample sizes as constants
+with_df <- df %>% filter(stratum == "with_translator")
+without_df <- df %>% filter(stratum == "without_translator")
+want_with <- floor(sample_size / 2)
+want_without <- sample_size - want_with
+avail_with <- nrow(with_df)
+avail_without <- nrow(without_df)
+take_with <- min(want_with, avail_with)
+take_without <- min(want_without, avail_without)
+rem <- sample_size - (take_with + take_without)
+if (rem > 0) {
+  left_with <- max(avail_with - take_with, 0L)
+  add_from_with <- min(rem, left_with)
+  take_with <- take_with + add_from_with
+  rem <- rem - add_from_with
+  if (rem > 0) {
+    left_without <- max(avail_without - take_without, 0L)
+    add_from_without <- min(rem, left_without)
+    take_without <- take_without + add_from_without
+    rem <- rem - add_from_without
+  }
+}
+with_n <- take_with
+without_n <- take_without
+message(sprintf("Stratified target with=%d without=%d; available with=%d without=%d; chosen with=%d without=%d", want_with, want_without, avail_with, avail_without, with_n, without_n))
+
+samp_with <- dplyr::slice_sample(with_df, n = with_n, replace = FALSE)
+samp_without <- dplyr::slice_sample(without_df, n = without_n, replace = FALSE)
+sampled <- bind_rows(samp_with, samp_without) %>% arrange(stratum, id)
+
+message(sprintf("Sampled %d rows (with_translator=%d, without_translator=%d)", nrow(sampled), nrow(samp_with), nrow(samp_without)))
+
+# Build English prompt directly from dnb_dc snippet
+build_prompt_from_dc <- function(dnb_dc_xml) {
+  # Truncate to keep prompts manageable
+  snippet <- dnb_dc_xml
+  max_chars <- 4000L
+  if (!is.na(snippet) && nchar(snippet, type = "chars") > max_chars) {
+    snippet <- substr(snippet, 1L, max_chars)
+  }
+  paste0(
+    "You are a bibliographic assistant. From the following OAI-DC metadata of a German National Library record, determine: \n",
+    "1) Whether the book is a translation.\n",
+    "2) If it is a translation: the translator's full name, the original title, and the original publication year.\n",
+    "3) If it is not a translation but seems to be a later edition (e.g., 2nd edition), try to infer the original publication year of the first edition.\n\n",
+    "Return a single JSON object only — no prose — with exactly these keys: \n",
+    "{\n  \"is_translation\": <boolean>,\n  \"translator_name\": <string|null>,\n  \"original_title\": <string|null>,\n  \"original_publication_year\": <integer|null>,\n  \"origin_confidence\": <number between 0 and 1>\n}\n",
+    "Use null for unknown values. Output JSON only.\n\n",
+    "OAI-DC metadata (XML):\n",
+    snippet
+  )
+}
+
+# Build English prompt from human-readable fields
+build_prompt_from_fields <- function(title = NA_character_, author = NA_character_, subTitle = NA_character_, translator = NA_character_, publisher = NA_character_, pubDate = NA_character_, ISBN = NA_character_) {
+  to_chr <- function(x) {
+    if (is.null(x) || length(x) == 0) {
+      return("")
+    }
+    y <- as.character(x)
+    if (length(y) == 0) {
+      return("")
+    }
+    y[is.na(y)] <- ""
+    y
+  }
+  title <- to_chr(title)
+  subTitle <- to_chr(subTitle)
+  author <- to_chr(author)
+  translator <- to_chr(translator)
+  publisher <- to_chr(publisher)
+  pubDate <- to_chr(pubDate)
+  ISBN <- to_chr(ISBN)
+  lines <- c(
+    if (nzchar(title)) sprintf("Title: %s", title) else NULL,
+    if (nzchar(subTitle)) sprintf("Subtitle: %s", subTitle) else NULL,
+    if (nzchar(author)) sprintf("Author: %s", author) else NULL,
+    if (nzchar(translator)) sprintf("Translator: %s", translator) else NULL,
+    if (nzchar(publisher)) sprintf("Publisher: %s", publisher) else NULL,
+    if (nzchar(pubDate)) sprintf("Publication year (German edition): %s", pubDate) else NULL,
+    if (nzchar(ISBN)) sprintf("ISBN: %s", ISBN) else NULL
+  )
+  context <- paste(lines, collapse = "\n")
+  paste0(
+    "You are a bibliographic assistant. For the following German-language book, determine:\n",
+    "1) Whether it is a translation.\n",
+    "2) If yes: the original title and the year of first publication.\n\n",
+    "Return a single JSON object only — no prose — with exactly these keys:\n",
+    "{\n  \"is_translation\": <boolean>,\n  \"translator_name\": <string|null>,\n  \"original_title\": <string|null>,\n  \"original_publication_year\": <integer|null>,\n  \"origin_confidence\": <number between 0 and 1>\n}\n",
+    "Use null for unknown. Output JSON only.\n\n",
+    "Metadata:\n",
+    context
+  )
+}
+
+# Models used for inference (replace Gemini with an Anthropic model)
+models <- c("claude-3-haiku-20240307", "gpt-4o-mini", "deepseek-chat")
+
+# Helper to build an empty one-row result with all model-prefixed columns
+empty_model_row <- function() {
+  vals <- list()
+  for (m in models) {
+    vals[[paste0(m, "_translator_name")]] <- NA_character_
+    vals[[paste0(m, "_original_title")]] <- NA_character_
+    vals[[paste0(m, "_original_publication_year")]] <- as.integer(NA)
+    vals[[paste0(m, "_origin_confidence")]] <- as.numeric(NA)
+  }
+  tibble::as_tibble(vals)
+}
+
+# Map model prefix to provider
+resolve_provider_for_model <- function(model) {
+  if (grepl("^gpt-", model, ignore.case = TRUE)) {
+    return(tidyllm::openai())
+  } else if (grepl("^claude-", model, ignore.case = TRUE)) {
+    return(tidyllm::claude())
+  } else if (grepl("^gemini-", model, ignore.case = TRUE)) {
+    return(tidyllm::gemini())
+  } else if (grepl("^deepseek", model, ignore.case = TRUE)) {
+    return(tidyllm::deepseek())
+  }
+  tidyllm::deepseek()
+}
+
+ensure_api_key_for_model <- function(model) {
+  if (grepl("^gpt-", model, ignore.case = TRUE)) {
+    if (!nzchar(Sys.getenv("OPENAI_API_KEY"))) stop("OPENAI_API_KEY is not set")
+  } else if (grepl("^claude-", model, ignore.case = TRUE)) {
+    if (!nzchar(Sys.getenv("ANTHROPIC_API_KEY"))) stop("ANTHROPIC_API_KEY is not set")
+  } else if (grepl("^gemini-", model, ignore.case = TRUE)) {
+    if (!nzchar(Sys.getenv("GOOGLE_API_KEY"))) stop("GOOGLE_API_KEY is not set")
+  } else if (grepl("^deepseek", model, ignore.case = TRUE)) {
+    if (!nzchar(Sys.getenv("DEEPSEEK_API_KEY"))) stop("DEEPSEEK_API_KEY is not set")
+  }
+  invisible(TRUE)
+}
+
+inference_for_prompt <- function(prompt_text, model, temperature = 0.1, timeout = 60, max_tries = 3) {
+  ensure_api_key_for_model(model)
+  provider <- resolve_provider_for_model(model)
+  prompt <- prompt_text
+
+  res <- NULL
+  err <- NULL
+  tryCatch(
+    {
+      res <- tidyllm::llm_message(prompt) |>
+        tidyllm::chat(
+          .provider = provider,
+          .model = model,
+          .temperature = temperature,
+          .timeout = timeout,
+          .max_tries = max_tries
+        )
+    },
+    error = function(e) err <<- conditionMessage(e)
+  )
+
+  if (!is.null(err)) {
+    message(sprintf(
+      "LLM request failed (model=%s): %s",
+      model, err
+    ))
+    return(tibble::tibble(
+      translator_name = NA_character_,
+      original_title = NA_character_,
+      original_publication_year = as.integer(NA),
+      origin_confidence = as.numeric(NA)
+    ))
+  }
+
+  # Prefer JSON parsing via tidyllm helper; falls back to NA on failure
+  parsed <- NULL
+  suppressWarnings({
+    parsed <- tidyllm::get_reply_data(res)
+  })
+
+  if (is.null(parsed)) {
+    # Try to extract JSON manually as a very last resort
+    raw <- tidyllm::get_reply(res)
+    json_txt <- stringr::str_extract(raw, "\\{[\\s\\S]*\\}")
+    if (!is.na(json_txt)) {
+      parsed <- try(jsonlite::fromJSON(json_txt), silent = TRUE)
+      if (inherits(parsed, "try-error")) parsed <- NULL
+    }
+    if (is.null(parsed) && tolower(Sys.getenv("DELIKO_DEBUG", "")) %in% c("1", "true", "yes", "y")) {
+      snippet <- substr(raw, 1L, 500L)
+      message(sprintf("Parse failed; raw reply snippet (model=%s): %s", model, gsub("\n", " ", snippet)))
+    }
+  }
+
+  if (is.null(parsed)) {
+    return(tibble::tibble(
+      translator_name = NA_character_,
+      original_title = NA_character_,
+      original_publication_year = as.integer(NA),
+      origin_confidence = as.numeric(NA)
+    ))
+  }
+
+  # parsed may be a data.frame/tibble or a named list
+  get_val <- function(obj, keys) {
+    for (k in keys) {
+      if (is.data.frame(obj) && k %in% names(obj)) {
+        return(obj[[k]][[1]])
+      }
+      if (is.list(obj) && !is.null(obj[[k]])) {
+        return(obj[[k]])
+      }
+    }
+    return(NA)
+  }
+
+  tr <- get_val(parsed, c("translator_name", "translator"))
+  ot <- get_val(parsed, c("original_title", "originalTitle", "title_original"))
+  opd <- suppressWarnings(as.integer(get_val(parsed, c("original_publication_year", "original_publication_date", "year", "year_of_first_publication"))))
+  cf <- suppressWarnings(as.numeric(get_val(parsed, c("origin_confidence", "confidence", "confidence_score"))))
+
+  if (!is.na(cf)) cf <- max(0, min(1, cf))
+
+  # Debug which keys were present
+  if (tolower(Sys.getenv("DELIKO_DEBUG", "")) %in% c("1", "true", "yes", "y")) {
+    present <- intersect(c("translator_name", "translator", "original_title", "originalTitle", "title_original", "original_publication_year", "original_publication_year", "year", "year_of_first_publication", "origin_confidence", "confidence", "confidence_score"), names(parsed))
+    message(sprintf("Parsed keys present (model=%s): %s", model, paste(present, collapse = ", ")))
+  }
+
+  tibble::tibble(
+    translator_name = ifelse(is.null(tr) || is.na(tr) || !nzchar(as.character(tr)), NA_character_, as.character(tr)),
+    original_title = ifelse(is.null(ot) || is.na(ot) || !nzchar(as.character(ot)), NA_character_, as.character(ot)),
+    original_publication_year = ifelse(is.na(opd), as.integer(NA), as.integer(opd)),
+    origin_confidence = ifelse(is.na(cf), as.numeric(NA), as.numeric(cf))
+  )
+}
+
+# Run LLMs in parallel for each row if possible
+# We use parallelization at the per-row level using future.apply if available; otherwise sequential.
+has_future <- (requireNamespace("future.apply", quietly = TRUE) && requireNamespace("future", quietly = TRUE))
+if (has_future) {
+  future::plan(future::multisession, workers = min(length(models), 3L))
+  on.exit(
+    {
+      try(future::plan(future::sequential), silent = TRUE)
+    },
+    add = TRUE
+  )
+}
+
+augment_row_with_models <- function(dnb_dc_xml) {
+  # Execute models in parallel if future.apply is available
+  run_model <- function(model) {
+    err_msg <- NULL
+    t0 <- Sys.time()
+    res <- tryCatch(
+      inference_for_prompt(dnb_dc_xml, model),
+      error = function(e) {
+        err_msg <<- conditionMessage(e)
+        tibble::tibble(
+          translator_name = NA_character_,
+          original_title = NA_character_,
+          original_publication_year = as.integer(NA),
+          origin_confidence = as.numeric(NA)
+        )
+      }
+    )
+    dt <- as.numeric(difftime(Sys.time(), t0, units = "secs"))
+    if (is.null(err_msg)) {
+      message(sprintf("LLM timing: model=%s, elapsed=%.2fs", model, dt))
+    } else {
+      message(sprintf("LLM timing: model=%s, elapsed=%.2fs, error=%s", model, dt, err_msg))
+    }
+    res
+  }
+  results <- if (has_future) {
+    future.apply::future_lapply(models, run_model)
+  } else {
+    lapply(models, run_model)
+  }
+  names(results) <- models
+
+  # Prefix columns with model name
+  out_list <- lapply(names(results), function(m) {
+    res <- results[[m]]
+    names(res) <- paste0(m, "_", names(res))
+    res
+  })
+  out <- tryCatch(dplyr::bind_cols(out_list), error = function(e) {
+    message(sprintf("bind_cols failed for model outputs: %s", conditionMessage(e)))
+    NULL
+  })
+  if (is.null(out)) out <- empty_model_row()
+  out
+}
+
+message(sprintf("Running models: %s; parallel=%s", paste(models, collapse = ", "), has_future))
+getv <- function(df, i, nm) if (nm %in% names(df)) df[[nm]][i] else NA_character_
+if (prompt_source == "text") {
+  prompts <- vapply(seq_len(nrow(sampled)), function(i) {
+    build_prompt_from_fields(
+      title = getv(sampled, i, "title"),
+      author = getv(sampled, i, "author"),
+      subTitle = getv(sampled, i, "subTitle"),
+      translator = getv(sampled, i, "translator"),
+      publisher = getv(sampled, i, "publisher"),
+      pubDate = getv(sampled, i, "pubDate"),
+      ISBN = getv(sampled, i, "ISBN")
+    )
+  }, character(1))
+} else {
+  prompts <- vapply(seq_len(nrow(sampled)), function(i) build_prompt_from_dc(sampled$dnb_dc[i]), character(1))
+}
+sampled$prompt_text <- prompts
+
+model_cols <- sampled %>%
+  mutate(.row = row_number()) %>%
+  split(.$.row) %>%
+  purrr::map(~ augment_row_with_models(.x$prompt_text[[1]])) %>%
+  dplyr::bind_rows()
+
+message(sprintf("sampled rows=%d, model_cols rows=%d, model_cols cols=%d", nrow(sampled), nrow(model_cols), ncol(model_cols)))
+
+# Ensure model_cols has same number of rows as sampled
+if (nrow(model_cols) != nrow(sampled)) {
+  message(sprintf("Adjusting model_cols rows from %d to %d", nrow(model_cols), nrow(sampled)))
+  if (nrow(model_cols) < nrow(sampled)) {
+    empty_row <- empty_model_row()
+    pad_list <- rep(list(empty_row), nrow(sampled) - nrow(model_cols))
+    pad <- dplyr::bind_rows(pad_list)
+    model_cols <- dplyr::bind_rows(model_cols, pad)
+  } else if (nrow(model_cols) > nrow(sampled)) {
+    model_cols <- dplyr::slice(model_cols, seq_len(nrow(sampled)))
+  }
+}
+
+# Add German title and publication year to the output
+title_de <- if ("title" %in% names(sampled)) as.character(sampled$title) else rep(NA_character_, nrow(sampled))
+pub_year_de <- if ("pubDate" %in% names(sampled)) suppressWarnings(as.integer(stringr::str_extract(as.character(sampled$pubDate), "\\\\d{4}"))) else rep(as.integer(NA), nrow(sampled))
+meta_cols <- tibble::tibble(title_de = title_de, pub_year_de = pub_year_de)
+out <- dplyr::bind_cols(sampled %>% select(id, stratum), meta_cols, model_cols)
+
+# Optional consensus columns (majority vote for title/name string equality; median for year/confidence)
+if (isTRUE(include_consensus)) {
+  # Helper to majority vote on strings (ignoring case/punctuation)
+  norm <- function(x) {
+    x <- tolower(trimws(x))
+    x <- gsub("[[:punct:]]+", "", x)
+    x[nzchar(x)]
+  }
+  get_mode <- function(vals) {
+    vals <- vals[!is.na(vals) & nzchar(vals)]
+    if (length(vals) == 0) {
+      return(NA_character_)
+    }
+    tbl <- sort(table(vals), decreasing = TRUE)
+    names(tbl)[1]
+  }
+  get_median_int <- function(vals) {
+    vals <- vals[!is.na(vals)]
+    if (length(vals) == 0) {
+      return(as.integer(NA))
+    }
+    as.integer(median(vals))
+  }
+  get_mean_num <- function(vals) {
+    vals <- vals[!is.na(vals)]
+    if (length(vals) == 0) {
+      return(as.numeric(NA))
+    }
+    as.numeric(mean(vals))
+  }
+
+  out <- out %>%
+    rowwise() %>%
+    mutate(
+      consensus_translator_name = {
+        vals <- as.character(unlist(c_across(ends_with("_translator_name"))))
+        vals <- norm(vals)
+        if (length(vals) == 0) NA_character_ else get_mode(vals)
+      },
+      consensus_original_title = {
+        vals <- as.character(unlist(c_across(ends_with("_original_title"))))
+        vals <- norm(vals)
+        if (length(vals) == 0) NA_character_ else get_mode(vals)
+      },
+      consensus_original_publication_year = get_median_int(as.integer(unlist(c_across(ends_with("_original_publication_year"))))),
+      consensus_origin_confidence = get_mean_num(as.numeric(unlist(c_across(ends_with("_origin_confidence")))))
+    ) %>%
+    ungroup()
+}
+
+# Title-case consensus fields
+out <- out %>% mutate(
+  consensus_translator_name = ifelse(is.na(consensus_translator_name), NA_character_, stringr::str_to_title(consensus_translator_name)),
+  consensus_original_title = ifelse(is.na(consensus_original_title), NA_character_, stringr::str_to_title(consensus_original_title))
+)
+
+readr::write_tsv(out, opts$out)
+message("Wrote ", nrow(out), " rows to ", opts$out)
+
+# Inter-annotator agreement for original_publication_year
+year_cols <- names(out)[endsWith(names(out), "_original_publication_year")]
+if (length(year_cols) >= 2) {
+  years_mat <- as.matrix(out[year_cols])
+  # Pairwise exact match rate
+  pairs <- combn(seq_along(year_cols), 2, simplify = FALSE)
+  if (length(pairs) > 0) {
+    msg <- vapply(pairs, function(p) {
+      a <- years_mat[, p[1]]
+      b <- years_mat[, p[2]]
+      agree <- mean(a == b, na.rm = TRUE)
+      sprintf("%s vs %s: %.2f", year_cols[p[1]], year_cols[p[2]], agree)
+    }, character(1))
+    message("Inter-annotator exact agreement (year): ")
+    for (m in msg) message("  ", m)
+  }
+  # Fleiss' kappa if irr is available
+  if (requireNamespace("irr", quietly = TRUE)) {
+    # Convert to factors per row; irr::kappam.fleiss expects raters in columns
+    # Handle NAs by treating them as a separate level
+    yrs_df <- as.data.frame(years_mat)
+    for (j in seq_along(yrs_df)) yrs_df[[j]] <- as.factor(ifelse(is.na(yrs_df[[j]]), "NA", as.character(yrs_df[[j]])))
+    suppressWarnings({
+      kappa_res <- try(irr::kappam.fleiss(yrs_df), silent = TRUE)
+    })
+    if (!inherits(kappa_res, "try-error")) {
+      message(sprintf("Fleiss' kappa (year): %.3f", kappa_res$value))
+    }
+  }
+}
commit	1e0a5d087343139157f10df07059ad176bc33174	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Sun Sep 28 13:43:52 2025 +0200
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Sun Sep 28 20:40:05 2025 +0200
tree	11df18cce2d7189a6841f4ca6fb0b4c2ac5ac641
parent	1bfd8304d6da39511bb73cbd57b9d1e04ae2a5cf [diff]