blob: 490338c337d07d423af7a864cc5229876b51c803 [file] [log] [blame]
Marc Kupietz9b526922025-09-28 20:42:34 +02001#!/usr/bin/env Rscript
2
3suppressPackageStartupMessages({
4 library(DBI)
5 library(duckdb)
6 library(tibble)
7})
8
#' Read a DuckDB table or query result into a tibble
#'
#' Public function: safe to source in RStudio. Opens `db`, runs either `sql`
#' (when non-empty) or `SELECT * FROM main.<table>` (optionally with a
#' `LIMIT`), and closes the connection again before returning.
#'
#' @param db Path to an existing DuckDB database file (single string).
#' @param table Name of the table in schema `main` to read. Only used, and
#'   only validated, when `sql` is empty.
#' @param sql Optional SQL query (single string). When non-empty it is
#'   executed verbatim and `table` and `limit` are ignored.
#' @param limit Maximum number of rows to read from `table`. Anything that is
#'   not a single finite number greater than 0 (e.g. 0, negative, `NA`,
#'   `NULL`) means "no limit". Fractions are rounded down.
#' @param read_only Open the database read-only? Only `TRUE` does so; any
#'   other value opens it read-write.
#' @return A tibble holding the query result.
duckdb2tibble <- function(
    db = "/home/kupietz/korap4dnb/epub2i5/data/deliko_metadata.duckdb",
    table = "deliko_metadata",
    sql = "",
    limit = 0,
    read_only = TRUE) {
  # Validate everything up front so bad arguments fail with a clear message
  # before any database is opened.
  stopifnot(
    is.character(db), length(db) == 1L,
    is.character(sql), length(sql) == 1L, !is.na(sql)
  )
  use_sql <- nzchar(sql)
  if (!use_sql) {
    stopifnot(
      is.character(table), length(table) == 1L, !is.na(table), nzchar(table)
    )
  }
  if (!file.exists(db)) stop("DuckDB not found: ", db)

  # NA / NULL / Inf / non-numeric limits are treated as "no limit" instead of
  # making the `if` below fail on a missing value.
  has_limit <- is.numeric(limit) && length(limit) == 1L &&
    is.finite(limit) && limit > 0

  # Create the driver for `db` itself, so the instance shut down during
  # cleanup is the one the connection actually uses.
  drv <- duckdb::duckdb(dbdir = db, read_only = isTRUE(read_only))
  on.exit(try(duckdb::duckdb_shutdown(drv), silent = TRUE), add = TRUE)
  con <- DBI::dbConnect(drv)
  # after = FALSE: disconnect first, then shut the driver down.
  on.exit(
    try(DBI::dbDisconnect(con, shutdown = FALSE), silent = TRUE),
    add = TRUE,
    after = FALSE
  )

  if (use_sql) {
    return(tibble::as_tibble(DBI::dbGetQuery(con, sql)))
  }

  # Quote schema and table separately so unusual table names stay valid SQL.
  fq_name <- paste0(
    as.character(DBI::dbQuoteIdentifier(con, "main")),
    ".",
    as.character(DBI::dbQuoteIdentifier(con, table))
  )
  query <- paste0("SELECT * FROM ", fq_name)
  if (has_limit) {
    # "%.0f" keeps large limits out of scientific notation and avoids the
    # NA that as.integer() yields above .Machine$integer.max.
    query <- paste0(query, " LIMIT ", sprintf("%.0f", floor(limit)))
  }

  tibble::as_tibble(DBI::dbGetQuery(con, query))
}
43
# CLI mode only: run when invoked via Rscript (i.e. any non-interactive
# session). Options are passed as key=value arguments (db, table, sql, limit,
# output). The environment variables DELIKO_DUCKDB, DELIKO_TABLE, DELIKO_SQL,
# LIMIT and OUT_RDS, when set, take precedence over the arguments.
if (!interactive()) {
  opts <- list(
    db = "/home/kupietz/korap4dnb/epub2i5/data/deliko_metadata.duckdb",
    table = "deliko_metadata",
    sql = "",
    limit = "0",
    output = ""
  )

  for (arg in commandArgs(trailingOnly = TRUE)) {
    # Split on the FIRST "=" only, so values may themselves contain "="
    # (e.g. sql=SELECT * FROM t WHERE a = 1).
    eq_pos <- as.integer(regexpr("=", arg, fixed = TRUE))
    if (eq_pos < 1L) next
    key <- substr(arg, 1L, eq_pos - 1L)
    value <- substr(arg, eq_pos + 1L, nchar(arg))
    # Unknown keys and empty values are ignored; the default stays in place.
    if (key %in% names(opts) && nzchar(value)) opts[[key]] <- value
  }

  # ENV overrides
  opts$db <- Sys.getenv("DELIKO_DUCKDB", opts$db)
  opts$table <- Sys.getenv("DELIKO_TABLE", opts$table)
  opts$sql <- Sys.getenv("DELIKO_SQL", opts$sql)
  opts$limit <- Sys.getenv("LIMIT", opts$limit)
  opts$output <- Sys.getenv("OUT_RDS", opts$output)

  # An unparsable limit becomes NA; fall back to "no limit" explicitly rather
  # than handing NA to duckdb2tibble().
  row_limit <- suppressWarnings(as.integer(opts$limit))
  if (is.na(row_limit)) {
    message("Ignoring invalid limit: ", opts$limit)
    row_limit <- 0L
  }

  message("Connecting DuckDB: ", opts$db)
  df <- duckdb2tibble(
    db = opts$db,
    table = opts$table,
    sql = opts$sql,
    limit = row_limit,
    read_only = TRUE
  )

  message("Loaded tibble: ", nrow(df), " rows x ", ncol(df), " cols")
  message("Columns: ", paste(names(df), collapse = ", "))
  # Preview at most the first 10 rows.
  print(utils::head(df, n = 10L))

  # Optionally persist the full result as an RDS file.
  if (nzchar(opts$output)) {
    saveRDS(df, opts$output)
    message("Saved tibble to ", opts$output)
  }
}
85}