| test_that("fetchAnnotations works with valid matches", { |
| skip_if_offline() |
| |
| kco <- KorAPConnection(verbose = FALSE, cache = FALSE, accessToken = NULL) |
| q <- kco %>% |
| corpusQuery("Test", "pubDate since 2014", metadataOnly = FALSE, fields = c("textSigle", "snippet")) %>% |
| fetchNext(maxFetch = 2) |
| |
| # Skip test if no matches found |
| skip_if(is.null(q@collectedMatches) || nrow(q@collectedMatches) == 0, "No matches found for test query") |
| |
| # Test that structured annotation columns are initially absent |
| expect_false("atokens" %in% colnames(q@collectedMatches)) |
| expect_false("pos" %in% colnames(q@collectedMatches)) |
| |
| # Test that matchID is preserved in collectedMatches |
| expect_true("matchID" %in% colnames(q@collectedMatches)) |
| expect_true(all(!is.na(q@collectedMatches$matchID))) |
| |
| # Test fetchAnnotations with default foundry |
| q_with_annotations <- fetchAnnotations(q, verbose = FALSE) |
| |
| # Check that structured annotation columns are now populated |
| expect_true("atokens" %in% colnames(q_with_annotations@collectedMatches)) |
| expect_true("pos" %in% colnames(q_with_annotations@collectedMatches)) |
| |
| # Check that the structured columns have left/match/right components |
| expect_true(all(c("left", "match", "right") %in% names(q_with_annotations@collectedMatches$atokens))) |
| expect_true(all(c("left", "match", "right") %in% names(q_with_annotations@collectedMatches$pos))) |
| |
| # Test fetchAnnotations with specific foundry |
| q_with_tt <- fetchAnnotations(q, foundry = "tt", verbose = FALSE) |
| expect_true("atokens" %in% colnames(q_with_tt@collectedMatches)) |
| expect_true("pos" %in% colnames(q_with_tt@collectedMatches)) |
| |
| # Test that annotations contain actual content (regression test for URL construction) |
| if (nrow(q_with_tt@collectedMatches) > 0) { |
| # Check that the first match has populated annotation data |
| expect_true(length(q_with_tt@collectedMatches$atokens$left[[1]]) > 0 || |
| length(q_with_tt@collectedMatches$atokens$match[[1]]) > 0 || |
| length(q_with_tt@collectedMatches$atokens$right[[1]]) > 0) |
| expect_true(length(q_with_tt@collectedMatches$pos$left[[1]]) > 0 || |
| length(q_with_tt@collectedMatches$pos$match[[1]]) > 0 || |
| length(q_with_tt@collectedMatches$pos$right[[1]]) > 0) |
| } |
| }) |
| |
| test_that("fetchAnnotations handles empty matches gracefully", { |
| kco <- KorAPConnection(verbose = FALSE, cache = FALSE, accessToken = NULL) |
| |
| # Create a query object with no collected matches |
| q <- KorAPQuery( |
| korapConnection = kco, |
| collectedMatches = NULL |
| ) |
| |
| # Should warn and return original object |
| expect_warning( |
| result <- fetchAnnotations(q, verbose = FALSE), |
| "No collected matches found" |
| ) |
| expect_identical(result, q) |
| }) |
| |
| test_that("fetchAnnotations preserves original object structure", { |
| skip_if_offline() |
| |
| kco <- KorAPConnection(verbose = FALSE, cache = FALSE, accessToken = NULL) |
| q <- kco %>% |
| corpusQuery("Test", "pubDate since 2014", metadataOnly = FALSE, fields = c("textSigle", "snippet")) %>% |
| fetchNext(maxFetch = 1) |
| |
| # Skip test if no matches found |
| skip_if(is.null(q@collectedMatches) || nrow(q@collectedMatches) == 0, "No matches found for test query") |
| |
| q_original <- q |
| q_with_annotations <- fetchAnnotations(q, verbose = FALSE) |
| |
| # Check that all original slots are preserved |
| expect_identical(q_with_annotations@korapConnection, q_original@korapConnection) |
| expect_identical(q_with_annotations@request, q_original@request) |
| expect_identical(q_with_annotations@vc, q_original@vc) |
| expect_identical(q_with_annotations@totalResults, q_original@totalResults) |
| |
| # collectedMatches should have additional annotation columns |
| expect_true(nrow(q_with_annotations@collectedMatches) == nrow(q_original@collectedMatches)) |
| expect_true(ncol(q_with_annotations@collectedMatches) > ncol(q_original@collectedMatches)) |
| |
| # Original columns should be preserved |
| original_cols <- colnames(q_original@collectedMatches) |
| expect_true(all(original_cols %in% colnames(q_with_annotations@collectedMatches))) |
| |
| # New annotation columns should be present |
| expect_true("atokens" %in% colnames(q_with_annotations@collectedMatches)) |
| expect_true("pos" %in% colnames(q_with_annotations@collectedMatches)) |
| }) |
| |
| test_that("fetchAnnotations returns structured left/match/right format", { |
| skip_if_offline() |
| |
| kco <- KorAPConnection(verbose = FALSE, cache = FALSE, accessToken = NULL) |
| q <- kco %>% |
| corpusQuery("Test", "pubDate since 2014", metadataOnly = FALSE, fields = c("textSigle", "snippet")) %>% |
| fetchNext(maxFetch = 1) |
| |
| # Skip test if no matches found |
| skip_if(is.null(q@collectedMatches) || nrow(q@collectedMatches) == 0, "No matches found for test query") |
| |
| q_with_annotations <- fetchAnnotations(q, foundry = "tt", verbose = FALSE) |
| |
| # Test that structured annotation columns exist |
| expect_true("atokens" %in% colnames(q_with_annotations@collectedMatches)) |
| expect_true("pos" %in% colnames(q_with_annotations@collectedMatches)) |
| |
| # Test the structure of annotation columns |
| atokens <- q_with_annotations@collectedMatches$atokens |
| pos <- q_with_annotations@collectedMatches$pos |
| |
| expect_true(is.data.frame(atokens)) |
| expect_true(is.data.frame(pos)) |
| |
| expect_true(all(c("left", "match", "right") %in% names(atokens))) |
| expect_true(all(c("left", "match", "right") %in% names(pos))) |
| |
| # Test that each component is a list column |
| expect_true(is.list(atokens$left)) |
| expect_true(is.list(atokens$match)) |
| expect_true(is.list(atokens$right)) |
| expect_true(is.list(pos$left)) |
| expect_true(is.list(pos$match)) |
| expect_true(is.list(pos$right)) |
| |
| # Test that the first match has actual data |
| if (nrow(q_with_annotations@collectedMatches) > 0) { |
| # At least one of left/match/right should have content |
| total_tokens <- length(atokens$left[[1]]) + length(atokens$match[[1]]) + length(atokens$right[[1]]) |
| expect_true(total_tokens > 0) |
| |
| total_pos <- length(pos$left[[1]]) + length(pos$match[[1]]) + length(pos$right[[1]]) |
| expect_true(total_pos > 0) |
| |
| # Token count should match POS count |
| expect_equal(total_tokens, total_pos) |
| |
| # Match tokens should not be empty (since we found a match) |
| expect_true(length(atokens$match[[1]]) > 0) |
| expect_true(length(pos$match[[1]]) > 0) |
| } |
| }) |
| |
| test_that("parser covers full span across multiple <mark> blocks", { |
| # Local, offline test to ensure correct match extraction when multiple |
| # <mark>…</mark> segments occur within the match span. |
| xml_snippet <- '<span class="context-left"></span> |
| <span class="match"> |
| <span title="tt/l:Wir"><span title="tt/p:PPER">Wir</span></span> |
| <mark> |
| <span title="tt/l:können"><span title="tt/p:VVFIN">können</span></span> |
| </mark> |
| <span title="tt/l:alles"><span title="tt/p:PIS">alles</span></span> |
| <mark> |
| <span title="tt/l:außer"><span title="tt/p:APPR">außer</span></span> |
| <span title="tt/l:Plan"><span title="tt/p:NN">Plan</span></span> |
| </mark> |
| </span> |
| <span class="context-right"></span>' |
| |
| parsed <- parse_xml_annotations_structured(xml_snippet) |
| |
| # Left context contains the pre-mark token |
| expect_equal(parsed$atokens$left, c("Wir")) |
| |
| # Match should include everything from the first <mark> to the last </mark>, |
| # including tokens between them |
| expect_equal(parsed$atokens$match, c("können", "alles", "außer", "Plan")) |
| |
| # POS and lemma lengths align with tokens in each section |
| expect_length(parsed$pos$match, length(parsed$atokens$match)) |
| expect_length(parsed$lemma$match, length(parsed$atokens$match)) |
| expect_equal(parsed$pos$match, c("VVFIN", "PIS", "APPR", "NN")) |
| expect_equal(parsed$lemma$match, c("können", "alles", "außer", "Plan")) |
| |
| # Right context should be empty in this snippet |
| expect_length(parsed$atokens$right, 0) |
| }) |
| |
| test_that("parser keeps tokens separated across between spans", { |
| xml_snippet <- '<span class="context-left"></span> |
| <span class="match"> |
| <mark><span title="tt/l:können"><span title="tt/p:VVFIN">können</span></span></mark> <span title="tt/l:alles"><span title="tt/p:PIS">alles</span></span><mark><span title="tt/l:außer"><span title="tt/p:APPR">außer</span></span></mark> |
| </span> |
| <span class="context-right"></span>' |
| |
| parsed <- parse_xml_annotations_structured(xml_snippet) |
| expect_equal(parsed$atokens$match, c("können", "alles", "außer")) |
| expect_equal(parsed$pos$match, c("VVFIN", "PIS", "APPR")) |
| expect_equal(parsed$lemma$match, c("können", "alles", "außer")) |
| }) |
| |
| test_that("parser keeps tokens separated across punctuation between spans", { |
| xml_snippet <- '<span class="context-left"></span> |
| <span class="match"> |
| <mark><span title="tt/l:können"><span title="tt/p:VVFIN">können</span></span></mark>, <span title="tt/l:alles"><span title="tt/p:PIS">alles</span></span><mark><span title="tt/l:außer"><span title="tt/p:APPR">außer</span></span></mark> |
| </span> |
| <span class="context-right"></span>' |
| |
| parsed <- parse_xml_annotations_structured(xml_snippet) |
| expect_equal(parsed$atokens$match, c("können", "alles", "außer")) |
| expect_equal(parsed$pos$match, c("VVFIN", "PIS", "APPR")) |
| expect_equal(parsed$lemma$match, c("können", "alles", "außer")) |
| }) |
| |
| test_that("online: fetchAnnotations aligns pos/lemma with tokens for complex query", { |
| skip_if_offline() |
| |
| kco <- KorAPConnection(verbose = FALSE, cache = FALSE, accessToken = NULL) |
| q <- kco %>% |
| corpusQuery('[orth="[wW]ir"] können alles [orth="ausser" | orth="außer"] [tt/pos=NN]', |
| metadataOnly = FALSE, |
| fields = c("textSigle", "snippet", "tokens")) %>% |
| fetchNext(maxFetch = 20) |
| |
| skip_if(is.null(q@collectedMatches) || nrow(q@collectedMatches) == 0, "No matches found for online test") |
| |
| q2 <- fetchAnnotations(q, foundry = "tt", verbose = FALSE) |
| |
| # For each match, POS and lemma counts must equal token count in the match span |
| for (i in seq_len(nrow(q2@collectedMatches))) { |
| tt <- q2@collectedMatches$tokens$match[[i]] |
| pp <- q2@collectedMatches$pos$match[[i]] |
| ll <- q2@collectedMatches$lemma$match[[i]] |
| expect_equal(length(tt), length(pp)) |
| expect_equal(length(tt), length(ll)) |
| } |
| }) |
| |
| test_that("fetchAnnotations aligns tokens and annotations across multiple <mark> blocks (stubbed API)", { |
| # Define a minimal dummy KorAPConnection-like S4 class for offline testing |
| setClass('DummyKCO', slots = c(apiUrl='character', verbose='logical')) |
| setMethod('apiCall', 'DummyKCO', function(kco, url, json = TRUE, getHeaders = FALSE, cache = FALSE, timeout = 10) { |
| list(snippet = '<span class="context-left"></span> <span class="match"> <span title="tt/l:Wir"><span title="tt/p:PPER">Wir</span></span> <mark> <span title="tt/l:können"><span title="tt/p:VVFIN">können</span></span> </mark> <span title="tt/l:alles"><span title="tt/p:PIS">alles</span></span> <mark> <span title="tt/l:außer"><span title="tt/p:APPR">außer</span></span> <span title="tt/l:Plan"><span title="tt/p:NN">Plan</span></span> </mark> </span> <span class="context-right"></span>') |
| }) |
| |
| # Build a minimal KorAPQuery with a dummy connection and a single match row |
| kco <- new('DummyKCO', apiUrl = 'http://dummy/', verbose = FALSE) |
| df <- data.frame(textSigle = 'A/B/C', matchStart = 1, matchEnd = 5, matchID = 'match-A/B/C-p1-5', stringsAsFactors = FALSE) |
| q <- KorAPQuery(korapConnection = kco, collectedMatches = df) |
| |
| q2 <- fetchAnnotations(q, foundry = 'tt', verbose = FALSE) |
| |
| # Expect full match span to be covered |
| expect_equal(q2@collectedMatches$atokens$left[[1]], c('Wir')) |
| expect_equal(q2@collectedMatches$atokens$match[[1]], c('können','alles','außer','Plan')) |
| expect_equal(q2@collectedMatches$pos$match[[1]], c('VVFIN','PIS','APPR','NN')) |
| expect_equal(q2@collectedMatches$lemma$match[[1]], c('können','alles','außer','Plan')) |
| |
| # Alignment checks |
| expect_length(q2@collectedMatches$pos$match[[1]], length(q2@collectedMatches$atokens$match[[1]])) |
| expect_length(q2@collectedMatches$lemma$match[[1]], length(q2@collectedMatches$atokens$match[[1]])) |
| }) |
| |
| test_that("matchID is preserved in collectedMatches", { |
| skip_if_offline() |
| |
| kco <- KorAPConnection(verbose = FALSE, cache = FALSE, accessToken = NULL) |
| q <- kco %>% |
| corpusQuery("Test", "pubDate since 2014", metadataOnly = FALSE, fields = c("textSigle", "snippet")) %>% |
| fetchNext(maxFetch = 1) |
| |
| # Skip test if no matches found |
| skip_if(is.null(q@collectedMatches) || nrow(q@collectedMatches) == 0, "No matches found for test query") |
| |
| # Check that matchID is present and valid |
| expect_true("matchID" %in% colnames(q@collectedMatches)) |
| expect_true(all(!is.na(q@collectedMatches$matchID))) |
| |
| # Verify matchID format (should contain position information) |
| expect_true(all(grepl("-p\\d+-\\d+", q@collectedMatches$matchID))) |
| |
| # Verify that matchStart and matchEnd are correctly extracted from matchID |
| for (i in seq_len(nrow(q@collectedMatches))) { |
| match_id <- q@collectedMatches$matchID[i] |
| positions <- gsub(".*-p(\\d+)-(\\d+).*", "\\1 \\2", match_id) |
| expected_start <- as.integer(stringr::word(positions, 1)) |
| expected_end <- as.integer(stringr::word(positions, 2)) - 1 |
| |
| expect_equal(q@collectedMatches$matchStart[i], expected_start) |
| expect_equal(q@collectedMatches$matchEnd[i], expected_end) |
| } |
| }) |
| |
| test_that("fetchAnnotations handles morphological annotations with pipe separators", { |
| skip_if_offline() |
| |
| kco <- KorAPConnection("https://korap.dnb.de", verbose = FALSE, cache = FALSE, accessToken = NULL) |
| q <- kco %>% |
| auth() %>% |
| corpusQuery("Ameisenplage", metadataOnly = FALSE) %>% |
| fetchNext(maxFetch = 1) |
| |
| # Skip test if no matches found |
| skip_if(is.null(q@collectedMatches) || nrow(q@collectedMatches) == 0, "No matches found for test query") |
| |
| # Test with marmot foundry which provides morphological annotations |
| q_with_morph <- fetchAnnotations(q, foundry = "marmot", verbose = FALSE) |
| |
| # Check that morphological annotation columns exist |
| expect_true("morph" %in% colnames(q_with_morph@collectedMatches)) |
| expect_true("atokens" %in% colnames(q_with_morph@collectedMatches)) |
| |
| # Test the structure of morphological annotation columns |
| morph <- q_with_morph@collectedMatches$morph |
| expect_true(is.data.frame(morph)) |
| expect_true(all(c("left", "match", "right") %in% names(morph))) |
| expect_true(is.list(morph$match)) |
| |
| # Test that morphological features use pipe separators |
| if (nrow(q_with_morph@collectedMatches) > 0) { |
| morph_data <- morph$match[[1]] |
| |
| # Check that we have morphological data |
| expect_true(length(morph_data) > 0) |
| |
| # If morphological data exists and is not NA, it should contain pipe separators |
| # for multiple features (e.g., "case:acc|gender:fem|number:sg") |
| if (!is.na(morph_data[1]) && nchar(morph_data[1]) > 0) { |
| # Should contain morphological features separated by pipes |
| expect_true(grepl("^[^|]+", morph_data[1])) # At least one feature |
| |
| # If multiple features exist, they should be pipe-separated |
| if (grepl("\\|", morph_data[1])) { |
| features <- unlist(strsplit(morph_data[1], "\\|")) |
| expect_true(length(features) > 1) |
| # Each feature should follow the pattern "attribute:value" |
| expect_true(all(grepl("^[^:]+:[^:]+$", features))) |
| } |
| } |
| } |
| }) |