Parse annotations into lists of vectors

Change-Id: I9ee3a5c10ff997380b9661dd603f8ec83c6a91a8
diff --git a/tests/testthat/test-fetchAnnotations.R b/tests/testthat/test-fetchAnnotations.R
new file mode 100644
index 0000000..46f604f
--- /dev/null
+++ b/tests/testthat/test-fetchAnnotations.R
@@ -0,0 +1,179 @@
+test_that("fetchAnnotations works with valid matches", {
+  skip_if_offline()
+
+  kco <- KorAPConnection(verbose = FALSE, cache = FALSE, accessToken = NULL)
+  q <- kco %>%
+    corpusQuery("Test", "pubDate since 2014", metadataOnly = FALSE, fields = c("textSigle", "snippet")) %>%
+    fetchNext(maxFetch = 2)
+
+  # Skip test if no matches found
+  skip_if(is.null(q@collectedMatches) || nrow(q@collectedMatches) == 0, "No matches found for test query")
+
+  # Test that structured annotation columns are initially absent
+  expect_false("atokens" %in% colnames(q@collectedMatches))
+  expect_false("pos" %in% colnames(q@collectedMatches))
+
+  # Test that matchID is preserved in collectedMatches
+  expect_true("matchID" %in% colnames(q@collectedMatches))
+  expect_true(all(!is.na(q@collectedMatches$matchID)))
+
+  # Test fetchAnnotations with default foundry
+  q_with_annotations <- fetchAnnotations(q, verbose = FALSE)
+
+  # Check that structured annotation columns are now populated
+  expect_true("atokens" %in% colnames(q_with_annotations@collectedMatches))
+  expect_true("pos" %in% colnames(q_with_annotations@collectedMatches))
+
+  # Check that the structured columns have left/match/right components
+  expect_true(all(c("left", "match", "right") %in% names(q_with_annotations@collectedMatches$atokens)))
+  expect_true(all(c("left", "match", "right") %in% names(q_with_annotations@collectedMatches$pos)))
+
+  # Test fetchAnnotations with specific foundry
+  q_with_tt <- fetchAnnotations(q, foundry = "tt", verbose = FALSE)
+  expect_true("atokens" %in% colnames(q_with_tt@collectedMatches))
+  expect_true("pos" %in% colnames(q_with_tt@collectedMatches))
+
+  # Test that annotations contain actual content (regression test for URL construction)
+  if (nrow(q_with_tt@collectedMatches) > 0) {
+    # Check that the first match has populated annotation data
+    expect_true(length(q_with_tt@collectedMatches$atokens$left[[1]]) > 0 ||
+      length(q_with_tt@collectedMatches$atokens$match[[1]]) > 0 ||
+      length(q_with_tt@collectedMatches$atokens$right[[1]]) > 0)
+    expect_true(length(q_with_tt@collectedMatches$pos$left[[1]]) > 0 ||
+      length(q_with_tt@collectedMatches$pos$match[[1]]) > 0 ||
+      length(q_with_tt@collectedMatches$pos$right[[1]]) > 0)
+  }
+})
+
+test_that("fetchAnnotations handles empty matches gracefully", {
+  kco <- KorAPConnection(verbose = FALSE, cache = FALSE, accessToken = NULL)
+
+  # Create a query object with no collected matches
+  q <- KorAPQuery(
+    korapConnection = kco,
+    collectedMatches = NULL
+  )
+
+  # Should warn and return original object
+  expect_warning(
+    result <- fetchAnnotations(q, verbose = FALSE),
+    "No collected matches found"
+  )
+  expect_identical(result, q)
+})
+
+test_that("fetchAnnotations preserves original object structure", {
+  skip_if_offline()
+
+  kco <- KorAPConnection(verbose = FALSE, cache = FALSE, accessToken = NULL)
+  q <- kco %>%
+    corpusQuery("Test", "pubDate since 2014", metadataOnly = FALSE, fields = c("textSigle", "snippet")) %>%
+    fetchNext(maxFetch = 1)
+
+  # Skip test if no matches found
+  skip_if(is.null(q@collectedMatches) || nrow(q@collectedMatches) == 0, "No matches found for test query")
+
+  q_original <- q
+  q_with_annotations <- fetchAnnotations(q, verbose = FALSE)
+
+  # Check that all original slots are preserved
+  expect_identical(q_with_annotations@korapConnection, q_original@korapConnection)
+  expect_identical(q_with_annotations@request, q_original@request)
+  expect_identical(q_with_annotations@vc, q_original@vc)
+  expect_identical(q_with_annotations@totalResults, q_original@totalResults)
+
+  # collectedMatches should have additional annotation columns
+  expect_true(nrow(q_with_annotations@collectedMatches) == nrow(q_original@collectedMatches))
+  expect_true(ncol(q_with_annotations@collectedMatches) > ncol(q_original@collectedMatches))
+
+  # Original columns should be preserved
+  original_cols <- colnames(q_original@collectedMatches)
+  expect_true(all(original_cols %in% colnames(q_with_annotations@collectedMatches)))
+
+  # New annotation columns should be present
+  expect_true("atokens" %in% colnames(q_with_annotations@collectedMatches))
+  expect_true("pos" %in% colnames(q_with_annotations@collectedMatches))
+})
+
+test_that("fetchAnnotations returns structured left/match/right format", {
+  skip_if_offline()
+
+  kco <- KorAPConnection(verbose = FALSE, cache = FALSE, accessToken = NULL)
+  q <- kco %>%
+    corpusQuery("Test", "pubDate since 2014", metadataOnly = FALSE, fields = c("textSigle", "snippet")) %>%
+    fetchNext(maxFetch = 1)
+
+  # Skip test if no matches found
+  skip_if(is.null(q@collectedMatches) || nrow(q@collectedMatches) == 0, "No matches found for test query")
+
+  q_with_annotations <- fetchAnnotations(q, foundry = "tt", verbose = FALSE)
+
+  # Test that structured annotation columns exist
+  expect_true("atokens" %in% colnames(q_with_annotations@collectedMatches))
+  expect_true("pos" %in% colnames(q_with_annotations@collectedMatches))
+
+  # Test the structure of annotation columns
+  atokens <- q_with_annotations@collectedMatches$atokens
+  pos <- q_with_annotations@collectedMatches$pos
+
+  expect_true(is.data.frame(atokens))
+  expect_true(is.data.frame(pos))
+
+  expect_true(all(c("left", "match", "right") %in% names(atokens)))
+  expect_true(all(c("left", "match", "right") %in% names(pos)))
+
+  # Test that each component is a list column
+  expect_true(is.list(atokens$left))
+  expect_true(is.list(atokens$match))
+  expect_true(is.list(atokens$right))
+  expect_true(is.list(pos$left))
+  expect_true(is.list(pos$match))
+  expect_true(is.list(pos$right))
+
+  # Test that the first match has actual data
+  if (nrow(q_with_annotations@collectedMatches) > 0) {
+    # At least one of left/match/right should have content
+    total_tokens <- length(atokens$left[[1]]) + length(atokens$match[[1]]) + length(atokens$right[[1]])
+    expect_true(total_tokens > 0)
+
+    total_pos <- length(pos$left[[1]]) + length(pos$match[[1]]) + length(pos$right[[1]])
+    expect_true(total_pos > 0)
+
+    # Token count should match POS count
+    expect_equal(total_tokens, total_pos)
+
+    # Match tokens should not be empty (since we found a match)
+    expect_true(length(atokens$match[[1]]) > 0)
+    expect_true(length(pos$match[[1]]) > 0)
+  }
+})
+
+test_that("matchID is preserved in collectedMatches", {
+  skip_if_offline()
+
+  kco <- KorAPConnection(verbose = FALSE, cache = FALSE, accessToken = NULL)
+  q <- kco %>%
+    corpusQuery("Test", "pubDate since 2014", metadataOnly = FALSE, fields = c("textSigle", "snippet")) %>%
+    fetchNext(maxFetch = 1)
+
+  # Skip test if no matches found
+  skip_if(is.null(q@collectedMatches) || nrow(q@collectedMatches) == 0, "No matches found for test query")
+
+  # Check that matchID is present and valid
+  expect_true("matchID" %in% colnames(q@collectedMatches))
+  expect_true(all(!is.na(q@collectedMatches$matchID)))
+
+  # Verify matchID format (should contain position information)
+  expect_true(all(grepl("-p\\d+-\\d+", q@collectedMatches$matchID)))
+
+  # Verify that matchStart and matchEnd are correctly extracted from matchID
+  for (i in seq_len(nrow(q@collectedMatches))) {
+    match_id <- q@collectedMatches$matchID[i]
+    positions <- gsub(".*-p(\\d+)-(\\d+).*", "\\1 \\2", match_id)
+    expected_start <- as.integer(stringr::word(positions, 1))
+    expected_end <- as.integer(stringr::word(positions, 2)) - 1
+
+    expect_equal(q@collectedMatches$matchStart[i], expected_start)
+    expect_equal(q@collectedMatches$matchEnd[i], expected_end)
+  }
+})