blob: 3ff7c2c6ab33e6b17b36640087aef53bb64ee75c [file] [log] [blame]
test_that("fetchAnnotations works with valid matches", {
skip_if_offline()
kco <- KorAPConnection(verbose = FALSE, cache = FALSE, accessToken = NULL)
q <- kco %>%
corpusQuery("Test", "pubDate since 2014", metadataOnly = FALSE, fields = c("textSigle", "snippet")) %>%
fetchNext(maxFetch = 2)
# Skip test if no matches found
skip_if(is.null(q@collectedMatches) || nrow(q@collectedMatches) == 0, "No matches found for test query")
# Test that structured annotation columns are initially absent
expect_false("atokens" %in% colnames(q@collectedMatches))
expect_false("pos" %in% colnames(q@collectedMatches))
# Test that matchID is preserved in collectedMatches
expect_true("matchID" %in% colnames(q@collectedMatches))
expect_true(all(!is.na(q@collectedMatches$matchID)))
# Test fetchAnnotations with default foundry
q_with_annotations <- fetchAnnotations(q, verbose = FALSE)
# Check that structured annotation columns are now populated
expect_true("atokens" %in% colnames(q_with_annotations@collectedMatches))
expect_true("pos" %in% colnames(q_with_annotations@collectedMatches))
# Check that the structured columns have left/match/right components
expect_true(all(c("left", "match", "right") %in% names(q_with_annotations@collectedMatches$atokens)))
expect_true(all(c("left", "match", "right") %in% names(q_with_annotations@collectedMatches$pos)))
# Test fetchAnnotations with specific foundry
q_with_tt <- fetchAnnotations(q, foundry = "tt", verbose = FALSE)
expect_true("atokens" %in% colnames(q_with_tt@collectedMatches))
expect_true("pos" %in% colnames(q_with_tt@collectedMatches))
# Test that annotations contain actual content (regression test for URL construction)
if (nrow(q_with_tt@collectedMatches) > 0) {
# Check that the first match has populated annotation data
expect_true(length(q_with_tt@collectedMatches$atokens$left[[1]]) > 0 ||
length(q_with_tt@collectedMatches$atokens$match[[1]]) > 0 ||
length(q_with_tt@collectedMatches$atokens$right[[1]]) > 0)
expect_true(length(q_with_tt@collectedMatches$pos$left[[1]]) > 0 ||
length(q_with_tt@collectedMatches$pos$match[[1]]) > 0 ||
length(q_with_tt@collectedMatches$pos$right[[1]]) > 0)
}
})
test_that("fetchAnnotations handles empty matches gracefully", {
kco <- KorAPConnection(verbose = FALSE, cache = FALSE, accessToken = NULL)
# Create a query object with no collected matches
q <- KorAPQuery(
korapConnection = kco,
collectedMatches = NULL
)
# Should warn and return original object
expect_warning(
result <- fetchAnnotations(q, verbose = FALSE),
"No collected matches found"
)
expect_identical(result, q)
})
test_that("fetchAnnotations preserves original object structure", {
skip_if_offline()
kco <- KorAPConnection(verbose = FALSE, cache = FALSE, accessToken = NULL)
q <- kco %>%
corpusQuery("Test", "pubDate since 2014", metadataOnly = FALSE, fields = c("textSigle", "snippet")) %>%
fetchNext(maxFetch = 1)
# Skip test if no matches found
skip_if(is.null(q@collectedMatches) || nrow(q@collectedMatches) == 0, "No matches found for test query")
q_original <- q
q_with_annotations <- fetchAnnotations(q, verbose = FALSE)
# Check that all original slots are preserved
expect_identical(q_with_annotations@korapConnection, q_original@korapConnection)
expect_identical(q_with_annotations@request, q_original@request)
expect_identical(q_with_annotations@vc, q_original@vc)
expect_identical(q_with_annotations@totalResults, q_original@totalResults)
# collectedMatches should have additional annotation columns
expect_true(nrow(q_with_annotations@collectedMatches) == nrow(q_original@collectedMatches))
expect_true(ncol(q_with_annotations@collectedMatches) > ncol(q_original@collectedMatches))
# Original columns should be preserved
original_cols <- colnames(q_original@collectedMatches)
expect_true(all(original_cols %in% colnames(q_with_annotations@collectedMatches)))
# New annotation columns should be present
expect_true("atokens" %in% colnames(q_with_annotations@collectedMatches))
expect_true("pos" %in% colnames(q_with_annotations@collectedMatches))
})
test_that("fetchAnnotations returns structured left/match/right format", {
skip_if_offline()
kco <- KorAPConnection(verbose = FALSE, cache = FALSE, accessToken = NULL)
q <- kco %>%
corpusQuery("Test", "pubDate since 2014", metadataOnly = FALSE, fields = c("textSigle", "snippet")) %>%
fetchNext(maxFetch = 1)
# Skip test if no matches found
skip_if(is.null(q@collectedMatches) || nrow(q@collectedMatches) == 0, "No matches found for test query")
q_with_annotations <- fetchAnnotations(q, foundry = "tt", verbose = FALSE)
# Test that structured annotation columns exist
expect_true("atokens" %in% colnames(q_with_annotations@collectedMatches))
expect_true("pos" %in% colnames(q_with_annotations@collectedMatches))
# Test the structure of annotation columns
atokens <- q_with_annotations@collectedMatches$atokens
pos <- q_with_annotations@collectedMatches$pos
expect_true(is.data.frame(atokens))
expect_true(is.data.frame(pos))
expect_true(all(c("left", "match", "right") %in% names(atokens)))
expect_true(all(c("left", "match", "right") %in% names(pos)))
# Test that each component is a list column
expect_true(is.list(atokens$left))
expect_true(is.list(atokens$match))
expect_true(is.list(atokens$right))
expect_true(is.list(pos$left))
expect_true(is.list(pos$match))
expect_true(is.list(pos$right))
# Test that the first match has actual data
if (nrow(q_with_annotations@collectedMatches) > 0) {
# At least one of left/match/right should have content
total_tokens <- length(atokens$left[[1]]) + length(atokens$match[[1]]) + length(atokens$right[[1]])
expect_true(total_tokens > 0)
total_pos <- length(pos$left[[1]]) + length(pos$match[[1]]) + length(pos$right[[1]])
expect_true(total_pos > 0)
# Token count should match POS count
expect_equal(total_tokens, total_pos)
# Match tokens should not be empty (since we found a match)
expect_true(length(atokens$match[[1]]) > 0)
expect_true(length(pos$match[[1]]) > 0)
}
})
test_that("parser covers full span across multiple <mark> blocks", {
# Local, offline test to ensure correct match extraction when multiple
# <mark>…</mark> segments occur within the match span.
xml_snippet <- '<span class="context-left"></span>
<span class="match">
<span title="tt/l:Wir"><span title="tt/p:PPER">Wir</span></span>
<mark>
<span title="tt/l:können"><span title="tt/p:VVFIN">können</span></span>
</mark>
<span title="tt/l:alles"><span title="tt/p:PIS">alles</span></span>
<mark>
<span title="tt/l:außer"><span title="tt/p:APPR">außer</span></span>
<span title="tt/l:Plan"><span title="tt/p:NN">Plan</span></span>
</mark>
</span>
<span class="context-right"></span>'
parsed <- parse_xml_annotations_structured(xml_snippet)
# Left context contains the pre-mark token
expect_equal(parsed$atokens$left, c("Wir"))
# Match should include everything from the first <mark> to the last </mark>,
# including tokens between them
expect_equal(parsed$atokens$match, c("können", "alles", "außer", "Plan"))
# POS and lemma lengths align with tokens in each section
expect_length(parsed$pos$match, length(parsed$atokens$match))
expect_length(parsed$lemma$match, length(parsed$atokens$match))
expect_equal(parsed$pos$match, c("VVFIN", "PIS", "APPR", "NN"))
expect_equal(parsed$lemma$match, c("können", "alles", "außer", "Plan"))
# Right context should be empty in this snippet
expect_length(parsed$atokens$right, 0)
})
test_that("parser keeps tokens separated across &nbsp; between spans", {
xml_snippet <- '<span class="context-left"></span>
<span class="match">
<mark><span title="tt/l:können"><span title="tt/p:VVFIN">können</span></span></mark>&nbsp;<span title="tt/l:alles"><span title="tt/p:PIS">alles</span></span><mark><span title="tt/l:außer"><span title="tt/p:APPR">außer</span></span></mark>
</span>
<span class="context-right"></span>'
parsed <- parse_xml_annotations_structured(xml_snippet)
expect_equal(parsed$atokens$match, c("können", "alles", "außer"))
expect_equal(parsed$pos$match, c("VVFIN", "PIS", "APPR"))
expect_equal(parsed$lemma$match, c("können", "alles", "außer"))
})
test_that("parser keeps tokens separated across punctuation between spans", {
xml_snippet <- '<span class="context-left"></span>
<span class="match">
<mark><span title="tt/l:können"><span title="tt/p:VVFIN">können</span></span></mark>, <span title="tt/l:alles"><span title="tt/p:PIS">alles</span></span><mark><span title="tt/l:außer"><span title="tt/p:APPR">außer</span></span></mark>
</span>
<span class="context-right"></span>'
parsed <- parse_xml_annotations_structured(xml_snippet)
expect_equal(parsed$atokens$match, c("können", "alles", "außer"))
expect_equal(parsed$pos$match, c("VVFIN", "PIS", "APPR"))
expect_equal(parsed$lemma$match, c("können", "alles", "außer"))
})
test_that("online: fetchAnnotations aligns pos/lemma with tokens for complex query", {
skip_if_offline()
kco <- KorAPConnection(verbose = FALSE, cache = FALSE, accessToken = NULL)
q <- kco %>%
corpusQuery('[orth="[wW]ir"] können alles [orth="ausser" | orth="außer"] [tt/pos=NN]',
metadataOnly = FALSE,
fields = c("textSigle", "snippet", "tokens")) %>%
fetchNext(maxFetch = 20)
skip_if(is.null(q@collectedMatches) || nrow(q@collectedMatches) == 0, "No matches found for online test")
q2 <- fetchAnnotations(q, foundry = "tt", verbose = FALSE)
# For each match, POS and lemma counts must equal token count in the match span
for (i in seq_len(nrow(q2@collectedMatches))) {
tt <- q2@collectedMatches$tokens$match[[i]]
pp <- q2@collectedMatches$pos$match[[i]]
ll <- q2@collectedMatches$lemma$match[[i]]
expect_equal(length(tt), length(pp))
expect_equal(length(tt), length(ll))
}
})
test_that("fetchAnnotations aligns tokens and annotations across multiple <mark> blocks (stubbed API)", {
# Define a minimal dummy KorAPConnection-like S4 class for offline testing
setClass('DummyKCO', slots = c(apiUrl='character', verbose='logical'))
setMethod('apiCall', 'DummyKCO', function(kco, url, json = TRUE, getHeaders = FALSE, cache = FALSE, timeout = 10) {
list(snippet = '<span class="context-left"></span> <span class="match"> <span title="tt/l:Wir"><span title="tt/p:PPER">Wir</span></span> <mark> <span title="tt/l:können"><span title="tt/p:VVFIN">können</span></span> </mark> <span title="tt/l:alles"><span title="tt/p:PIS">alles</span></span> <mark> <span title="tt/l:außer"><span title="tt/p:APPR">außer</span></span> <span title="tt/l:Plan"><span title="tt/p:NN">Plan</span></span> </mark> </span> <span class="context-right"></span>')
})
# Build a minimal KorAPQuery with a dummy connection and a single match row
kco <- new('DummyKCO', apiUrl = 'http://dummy/', verbose = FALSE)
df <- data.frame(textSigle = 'A/B/C', matchStart = 1, matchEnd = 5, matchID = 'match-A/B/C-p1-5', stringsAsFactors = FALSE)
q <- KorAPQuery(korapConnection = kco, collectedMatches = df)
q2 <- fetchAnnotations(q, foundry = 'tt', verbose = FALSE)
# Expect full match span to be covered
expect_equal(q2@collectedMatches$atokens$left[[1]], c('Wir'))
expect_equal(q2@collectedMatches$atokens$match[[1]], c('können','alles','außer','Plan'))
expect_equal(q2@collectedMatches$pos$match[[1]], c('VVFIN','PIS','APPR','NN'))
expect_equal(q2@collectedMatches$lemma$match[[1]], c('können','alles','außer','Plan'))
# Alignment checks
expect_length(q2@collectedMatches$pos$match[[1]], length(q2@collectedMatches$atokens$match[[1]]))
expect_length(q2@collectedMatches$lemma$match[[1]], length(q2@collectedMatches$atokens$match[[1]]))
})
test_that("matchID is preserved in collectedMatches", {
skip_if_offline()
kco <- KorAPConnection(verbose = FALSE, cache = FALSE, accessToken = NULL)
q <- kco %>%
corpusQuery("Test", "pubDate since 2014", metadataOnly = FALSE, fields = c("textSigle", "snippet")) %>%
fetchNext(maxFetch = 1)
# Skip test if no matches found
skip_if(is.null(q@collectedMatches) || nrow(q@collectedMatches) == 0, "No matches found for test query")
# Check that matchID is present and valid
expect_true("matchID" %in% colnames(q@collectedMatches))
expect_true(all(!is.na(q@collectedMatches$matchID)))
# Verify matchID format (should contain position information)
expect_true(all(grepl("-p\\d+-\\d+", q@collectedMatches$matchID)))
# Verify that matchStart and matchEnd are correctly extracted from matchID
for (i in seq_len(nrow(q@collectedMatches))) {
match_id <- q@collectedMatches$matchID[i]
positions <- gsub(".*-p(\\d+)-(\\d+).*", "\\1 \\2", match_id)
expected_start <- as.integer(stringr::word(positions, 1))
expected_end <- as.integer(stringr::word(positions, 2)) - 1
expect_equal(q@collectedMatches$matchStart[i], expected_start)
expect_equal(q@collectedMatches$matchEnd[i], expected_end)
}
})
test_that("fetchAnnotations handles morphological annotations with pipe separators", {
skip_if_offline()
kco <- KorAPConnection("https://korap.dnb.de", verbose = FALSE, cache = FALSE, accessToken = NULL)
q <- kco %>%
auth() %>%
corpusQuery("Ameisenplage", metadataOnly = FALSE) %>%
fetchNext(maxFetch = 1)
# Skip test if no matches found
skip_if(is.null(q@collectedMatches) || nrow(q@collectedMatches) == 0, "No matches found for test query")
# Test with marmot foundry which provides morphological annotations
q_with_morph <- fetchAnnotations(q, foundry = "marmot", verbose = FALSE)
# Check that morphological annotation columns exist
expect_true("morph" %in% colnames(q_with_morph@collectedMatches))
expect_true("atokens" %in% colnames(q_with_morph@collectedMatches))
# Test the structure of morphological annotation columns
morph <- q_with_morph@collectedMatches$morph
expect_true(is.data.frame(morph))
expect_true(all(c("left", "match", "right") %in% names(morph)))
expect_true(is.list(morph$match))
# Test that morphological features use pipe separators
if (nrow(q_with_morph@collectedMatches) > 0) {
morph_data <- morph$match[[1]]
# Check that we have morphological data
expect_true(length(morph_data) > 0)
# If morphological data exists and is not NA, it should contain pipe separators
# for multiple features (e.g., "case:acc|gender:fem|number:sg")
if (!is.na(morph_data[1]) && nchar(morph_data[1]) > 0) {
# Should contain morphological features separated by pipes
expect_true(grepl("^[^|]+", morph_data[1])) # At least one feature
# If multiple features exist, they should be pipe-separated
if (grepl("\\|", morph_data[1])) {
features <- unlist(strsplit(morph_data[1], "\\|"))
expect_true(length(features) > 1)
# Each feature should follow the pattern "attribute:value"
expect_true(all(grepl("^[^:]+:[^:]+$", features)))
}
}
}
})