# Blame - tests/testthat/test-fetchAnnotations.R - KorAP/RKorAPClient
#
# blob: edc415b711ac75f246ba258f8ad7bb90869fd812 [file] [log] [blame]

# Fetches two matches for a simple query, then checks that fetchAnnotations()
# adds the structured `atokens` and `pos` columns, both with the default
# foundry and with an explicitly requested one ("tt").
# Skipped when offline or when the query returns no matches.
test_that("fetchAnnotations works with valid matches", {
  skip_if_offline()

  kco <- KorAPConnection(verbose = FALSE, cache = FALSE, accessToken = NULL)
  q <- kco %>%
    corpusQuery(
      "Test",
      "pubDate since 2014",
      metadataOnly = FALSE,
      fields = c("textSigle", "snippet")
    ) %>%
    fetchNext(maxFetch = 2)

  # Skip test if no matches found
  skip_if(
    is.null(q@collectedMatches) || nrow(q@collectedMatches) == 0,
    "No matches found for test query"
  )

  # Structured annotation columns must be absent before annotations are fetched
  initial_cols <- colnames(q@collectedMatches)
  expect_false("atokens" %in% initial_cols)
  expect_false("pos" %in% initial_cols)

  # matchID must be present and fully populated in collectedMatches
  expect_true("matchID" %in% initial_cols)
  expect_true(all(!is.na(q@collectedMatches$matchID)))

  # fetchAnnotations with the default foundry
  q_with_annotations <- fetchAnnotations(q, verbose = FALSE)
  annotated <- q_with_annotations@collectedMatches

  # Structured annotation columns are now present
  expect_true("atokens" %in% colnames(annotated))
  expect_true("pos" %in% colnames(annotated))

  # Each structured column has left/match/right components
  context_parts <- c("left", "match", "right")
  expect_true(all(context_parts %in% names(annotated$atokens)))
  expect_true(all(context_parts %in% names(annotated$pos)))

  # fetchAnnotations with a specific foundry
  q_with_tt <- fetchAnnotations(q, foundry = "tt", verbose = FALSE)
  tt_matches <- q_with_tt@collectedMatches
  expect_true("atokens" %in% colnames(tt_matches))
  expect_true("pos" %in% colnames(tt_matches))

  # Annotations must contain actual content
  # (regression test for URL construction)
  if (nrow(tt_matches) > 0) {
    # The first match needs data in at least one of left/match/right
    expect_true(
      length(tt_matches$atokens$left[[1]]) > 0 ||
        length(tt_matches$atokens$match[[1]]) > 0 ||
        length(tt_matches$atokens$right[[1]]) > 0
    )
    expect_true(
      length(tt_matches$pos$left[[1]]) > 0 ||
        length(tt_matches$pos$match[[1]]) > 0 ||
        length(tt_matches$pos$right[[1]]) > 0
    )
  }
})
				47
				48	test_that("fetchAnnotations handles empty matches gracefully", {
				49	kco <- KorAPConnection(verbose = FALSE, cache = FALSE, accessToken = NULL)
				50
				51	# Create a query object with no collected matches
				52	q <- KorAPQuery(
				53	korapConnection = kco,
				54	collectedMatches = NULL
				55	)
				56
				57	# Should warn and return original object
				58	expect_warning(
				59	result <- fetchAnnotations(q, verbose = FALSE),
				60	"No collected matches found"
				61	)
				62	expect_identical(result, q)
				63	})
				64
				65	test_that("fetchAnnotations preserves original object structure", {
				66	skip_if_offline()
				67
				68	kco <- KorAPConnection(verbose = FALSE, cache = FALSE, accessToken = NULL)
				69	q <- kco %>%
				70	corpusQuery("Test", "pubDate since 2014", metadataOnly = FALSE, fields = c("textSigle", "snippet")) %>%
				71	fetchNext(maxFetch = 1)
				72
				73	# Skip test if no matches found
				74	skip_if(is.null(q@collectedMatches) \|\| nrow(q@collectedMatches) == 0, "No matches found for test query")
				75
				76	q_original <- q
				77	q_with_annotations <- fetchAnnotations(q, verbose = FALSE)
				78
				79	# Check that all original slots are preserved
				80	expect_identical(q_with_annotations@korapConnection, q_original@korapConnection)
				81	expect_identical(q_with_annotations@request, q_original@request)
				82	expect_identical(q_with_annotations@vc, q_original@vc)
				83	expect_identical(q_with_annotations@totalResults, q_original@totalResults)
				84
				85	# collectedMatches should have additional annotation columns
				86	expect_true(nrow(q_with_annotations@collectedMatches) == nrow(q_original@collectedMatches))
				87	expect_true(ncol(q_with_annotations@collectedMatches) > ncol(q_original@collectedMatches))
				88
				89	# Original columns should be preserved
				90	original_cols <- colnames(q_original@collectedMatches)
				91	expect_true(all(original_cols %in% colnames(q_with_annotations@collectedMatches)))
				92
				93	# New annotation columns should be present
				94	expect_true("atokens" %in% colnames(q_with_annotations@collectedMatches))
				95	expect_true("pos" %in% colnames(q_with_annotations@collectedMatches))
				96	})
				97
				98	test_that("fetchAnnotations returns structured left/match/right format", {
				99	skip_if_offline()
				100
				101	kco <- KorAPConnection(verbose = FALSE, cache = FALSE, accessToken = NULL)
				102	q <- kco %>%
				103	corpusQuery("Test", "pubDate since 2014", metadataOnly = FALSE, fields = c("textSigle", "snippet")) %>%
				104	fetchNext(maxFetch = 1)
				105
				106	# Skip test if no matches found
				107	skip_if(is.null(q@collectedMatches) \|\| nrow(q@collectedMatches) == 0, "No matches found for test query")
				108
				109	q_with_annotations <- fetchAnnotations(q, foundry = "tt", verbose = FALSE)
				110
				111	# Test that structured annotation columns exist
				112	expect_true("atokens" %in% colnames(q_with_annotations@collectedMatches))
				113	expect_true("pos" %in% colnames(q_with_annotations@collectedMatches))
				114
				115	# Test the structure of annotation columns
				116	atokens <- q_with_annotations@collectedMatches$atokens
				117	pos <- q_with_annotations@collectedMatches$pos
				118
				119	expect_true(is.data.frame(atokens))
				120	expect_true(is.data.frame(pos))
				121
				122	expect_true(all(c("left", "match", "right") %in% names(atokens)))
				123	expect_true(all(c("left", "match", "right") %in% names(pos)))
				124
				125	# Test that each component is a list column
				126	expect_true(is.list(atokens$left))
				127	expect_true(is.list(atokens$match))
				128	expect_true(is.list(atokens$right))
				129	expect_true(is.list(pos$left))
				130	expect_true(is.list(pos$match))
				131	expect_true(is.list(pos$right))
				132
				133	# Test that the first match has actual data
				134	if (nrow(q_with_annotations@collectedMatches) > 0) {
				135	# At least one of left/match/right should have content
				136	total_tokens <- length(atokens$left[[1]]) + length(atokens$match[[1]]) + length(atokens$right[[1]])
				137	expect_true(total_tokens > 0)
				138
				139	total_pos <- length(pos$left[[1]]) + length(pos$match[[1]]) + length(pos$right[[1]])
				140	expect_true(total_pos > 0)
				141
				142	# Token count should match POS count
				143	expect_equal(total_tokens, total_pos)
				144
				145	# Match tokens should not be empty (since we found a match)
				146	expect_true(length(atokens$match[[1]]) > 0)
				147	expect_true(length(pos$match[[1]]) > 0)
				148	}
				149	})
				150
				151	test_that("matchID is preserved in collectedMatches", {
				152	skip_if_offline()
				153
				154	kco <- KorAPConnection(verbose = FALSE, cache = FALSE, accessToken = NULL)
				155	q <- kco %>%
				156	corpusQuery("Test", "pubDate since 2014", metadataOnly = FALSE, fields = c("textSigle", "snippet")) %>%
				157	fetchNext(maxFetch = 1)
				158
				159	# Skip test if no matches found
				160	skip_if(is.null(q@collectedMatches) \|\| nrow(q@collectedMatches) == 0, "No matches found for test query")
				161
				162	# Check that matchID is present and valid
				163	expect_true("matchID" %in% colnames(q@collectedMatches))
				164	expect_true(all(!is.na(q@collectedMatches$matchID)))
				165
				166	# Verify matchID format (should contain position information)
				167	expect_true(all(grepl("-p\\d+-\\d+", q@collectedMatches$matchID)))
				168
				169	# Verify that matchStart and matchEnd are correctly extracted from matchID
				170	for (i in seq_len(nrow(q@collectedMatches))) {
				171	match_id <- q@collectedMatches$matchID[i]
				172	positions <- gsub(".-p(\\d+)-(\\d+).", "\\1 \\2", match_id)
				173	expected_start <- as.integer(stringr::word(positions, 1))
				174	expected_end <- as.integer(stringr::word(positions, 2)) - 1
				175
				176	expect_equal(q@collectedMatches$matchStart[i], expected_start)
				177	expect_equal(q@collectedMatches$matchEnd[i], expected_end)
				178	}
				179	})
# Queries https://korap.dnb.de (after auth()) and fetches annotations with the
# "marmot" foundry, then checks that the `morph` column has the structured
# left/match/right layout and that multi-feature values are pipe-separated
# "attribute:value" pairs.
# Skipped when offline or when the query returns no matches.
test_that("fetchAnnotations handles morphological annotations with pipe separators", {
  skip_if_offline()

  kco <- KorAPConnection(
    "https://korap.dnb.de",
    verbose = FALSE,
    cache = FALSE,
    accessToken = NULL
  )
  q <- kco %>%
    auth() %>%
    corpusQuery("Ameisenplage", metadataOnly = FALSE) %>%
    fetchNext(maxFetch = 1)

  # Skip test if no matches found
  skip_if(
    is.null(q@collectedMatches) || nrow(q@collectedMatches) == 0,
    "No matches found for test query"
  )

  # Test with marmot foundry which provides morphological annotations
  q_with_morph <- fetchAnnotations(q, foundry = "marmot", verbose = FALSE)
  annotated <- q_with_morph@collectedMatches

  # Morphological annotation columns exist
  expect_true("morph" %in% colnames(annotated))
  expect_true("atokens" %in% colnames(annotated))

  # Structure of the morphological annotation column
  morph <- annotated$morph
  expect_true(is.data.frame(morph))
  expect_true(all(c("left", "match", "right") %in% names(morph)))
  expect_true(is.list(morph$match))

  # Morphological features use pipe separators
  if (nrow(annotated) > 0) {
    morph_data <- morph$match[[1]]

    # We have morphological data
    expect_true(length(morph_data) > 0)

    # If morphological data exists and is not NA, it should contain pipe
    # separators for multiple features (e.g., "case:acc|gender:fem|number:sg")
    if (!is.na(morph_data[1]) && nchar(morph_data[1]) > 0) {
      # At least one feature before any pipe
      expect_true(grepl("^[^|]+", morph_data[1]))

      # If multiple features exist, they should be pipe-separated
      if (grepl("\\|", morph_data[1])) {
        features <- unlist(strsplit(morph_data[1], "\\|"))
        expect_true(length(features) > 1)
        # Each feature should follow the pattern "attribute:value"
        expect_true(all(grepl("^[^:]+:[^:]+$", features)))
      }
    }
  }
})
				225	}
				226	}
				227	}
				228	})