Actually run the code in the LLM doc tests
Change-Id: I9293a0346350a7102c62d5904aa5840e1292b500
diff --git a/tests/testthat/test-readme-against-llm.R b/tests/testthat/test-readme-against-llm.R
index 2aa7137..93aed65 100644
--- a/tests/testthat/test-readme-against-llm.R
+++ b/tests/testthat/test-readme-against-llm.R
@@ -85,6 +85,45 @@
trimws(code)
}
+# Returns TRUE when `code` parses as R; otherwise prints the parse error and returns FALSE.
+test_code_syntax <- function(code) {
+  on_parse_error <- function(err) {
+    cat("Syntax error:", as.character(err), "\n")
+    FALSE
+  }
+  # The text is only parsed, never evaluated; reaching TRUE means that
+  # parse() did not signal an error.
+  tryCatch({ parse(text = code); TRUE }, error = on_parse_error)
+}
+
+# Runs `code` if RUN_LLM_CODE == "true": TRUE on success, FALSE on error, NA when skipped.
+run_code_if_enabled <- function(code, test_name) {
+  if (Sys.getenv("RUN_LLM_CODE") != "true") {
+    cat("Skipping code execution (set RUN_LLM_CODE=true to enable)\n")
+    return(NA)
+  }
+  cat("Running generated code for", test_name, "...\n")
+  tryCatch({
+    # Fresh child of the enclosing env: generated code cannot see or clobber this helper's locals.
+    result <- eval(parse(text = code), envir = new.env(parent = parent.env(environment())))
+    cat("Code executed successfully. Result type:", class(result), "\n")
+    if (is.data.frame(result)) {
+      cat("Result dimensions:", nrow(result), "rows,", ncol(result), "columns\n")
+      if (nrow(result) > 0) {
+        cat("First few rows:\n")
+        print(head(result, 3))
+      }
+    } else {
+      cat("Result preview:\n")
+      print(result)
+    }
+    TRUE
+  }, error = function(e) {
+    cat("Runtime error:", as.character(e), "\n")
+    FALSE
+  })
+}
+
test_that("GPT-4.1 mini can solve frequency query task with README guidance", {
skip_if_not(nzchar(Sys.getenv("OPENAI_API_KEY")), "OPENAI_API_KEY not set")
skip_if_not(!is.null(find_readme_path()), "Readme.md not found in current or parent directories")
@@ -110,18 +149,18 @@
expect_true(grepl("\\|>", generated_code) || grepl("%>%", generated_code),
"Generated code should use pipe operators")
- # Optional: Try to parse the generated code to check for syntax errors
- parsed_successfully <- tryCatch({
- parse(text = generated_code)
- TRUE
- }, error = function(e) {
- FALSE
- })
-
- expect_true(parsed_successfully, "Generated code should be syntactically valid R code")
+ # Test code syntax
+ syntax_valid <- test_code_syntax(generated_code)
+ expect_true(syntax_valid, "Generated code should be syntactically valid R code")
# Print the generated code for manual inspection
cat("Generated code:\n", generated_code, "\n")
+
+ # Run the code if RUN_LLM_CODE is set
+ execution_result <- run_code_if_enabled(generated_code, "frequency query")
+ if (!is.na(execution_result)) {
+ expect_true(execution_result, "Generated code should execute without runtime errors")
+ }
})
test_that("GPT-4.1 mini can solve collocation analysis task with README guidance", {
@@ -130,7 +169,7 @@
# Create the prompt for collocation analysis
prompt <- create_readme_prompt(
- "write R code to perform a collocation analysis for the word 'setzen' (looking for light verb constructions). The code should use the RKorAPClient package's collocationAnalysis function.",
+ "write R code to perform a collocation analysis for the lemma 'setzen'. The code should use the RKorAPClient package's collocationAnalysis function.",
"Write R code to perform collocation analysis for 'setzen' using RKorAPClient."
)
@@ -149,8 +188,18 @@
expect_true(grepl("leftContextSize|rightContextSize", generated_code),
"Generated code should include context size parameters")
+ # Test code syntax
+ syntax_valid <- test_code_syntax(generated_code)
+ expect_true(syntax_valid, "Generated code should be syntactically valid R code")
+
# Print the generated code for manual inspection
cat("Generated collocation analysis code:\n", generated_code, "\n")
+
+ # Run the code if RUN_LLM_CODE is set
+ execution_result <- run_code_if_enabled(generated_code, "collocation analysis")
+ if (!is.na(execution_result)) {
+ expect_true(execution_result, "Generated code should execute without runtime errors")
+ }
})
test_that("GPT-4.1 mini can solve corpus query task with README guidance", {
@@ -178,6 +227,16 @@
expect_true(grepl("\\|>", generated_code) || grepl("%>%", generated_code),
"Generated code should use pipe operators")
+ # Test code syntax
+ syntax_valid <- test_code_syntax(generated_code)
+ expect_true(syntax_valid, "Generated code should be syntactically valid R code")
+
# Print the generated code for manual inspection
cat("Generated corpus query code:\n", generated_code, "\n")
+
+ # Run the code if RUN_LLM_CODE is set
+ execution_result <- run_code_if_enabled(generated_code, "corpus query")
+ if (!is.na(execution_result)) {
+ expect_true(execution_result, "Generated code should execute without runtime errors")
+ }
})