Add overwrite parameter to fetchAnnotations

Change-Id: I395dbcb44a18e04679232bdc65b4e2912836e4fa

commit: 93787d52e349fae2d6b1a9e2a2ca07bf1ad5dd60 [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Wed Sep 03 13:33:25 2025 +0200
committer: Marc Kupietz <kupietz@ids-mannheim.de> Wed Sep 03 13:33:25 2025 +0200
tree: d42eabe0fd2b40c13e21cc31cebae12c301315d4
parent: 560b591ea7e4acefff9892d2a1f10cda0d139340 [diff]
diff --git a/R/KorAPQuery.R b/R/KorAPQuery.R
index ad867c5..7e4d9cd 100644
--- a/R/KorAPQuery.R
+++ b/R/KorAPQuery.R

@@ -1118,6 +1118,10 @@
 #'
 #' @param kqo object obtained from [corpusQuery()] with collected matches. Note: the original corpus query should have `metadataOnly = FALSE` for annotation parsing to work.
 #' @param foundry string specifying the foundry to use for annotations (default: "tt" for Tree-Tagger)
+#' @param overwrite logical; if TRUE, re-fetch and replace any existing
+#'   annotation columns. If FALSE (default), keep existing annotations and only
+#'   fill in layers or rows that are still missing, empty or NA (e.g., keep
+#'   POS/lemma from a previous foundry while adding morph from another).
 #' @param verbose print progress information if true
 #' @return The updated `kqo` object with annotation columns 
 #' like `pos`, `lemma`, `morph` (and `atokens` and `annotation_snippet`)
@@ -1158,7 +1162,7 @@
 #' q@collectedMatches$pos$left[1] # POS tags for the left context of the first match
 #' }
 #' @export
-setMethod("fetchAnnotations", "KorAPQuery", function(kqo, foundry = "tt", verbose = kqo@korapConnection@verbose) {
+setMethod("fetchAnnotations", "KorAPQuery", function(kqo, foundry = "tt", overwrite = FALSE, verbose = kqo@korapConnection@verbose) {
   if (is.null(kqo@collectedMatches) || nrow(kqo@collectedMatches) == 0) {
     warning("No collected matches found. Please run fetchNext() or fetchAll() first.")
     return(kqo)
@@ -1184,13 +1188,26 @@
     )
   }
 
-  # Initialize all annotation columns using the helper function
+  # Track which annotation columns already existed to decide overwrite behavior
+  existing_types <- list(
+    pos = "pos" %in% colnames(df),
+    lemma = "lemma" %in% colnames(df),
+    morph = "morph" %in% colnames(df),
+    atokens = "atokens" %in% colnames(df),
+    annotation_snippet = "annotation_snippet" %in% colnames(df)
+  )
+
+  # Initialize annotation columns using the helper function
   annotation_types <- c("pos", "lemma", "morph", "atokens")
   for (type in annotation_types) {
-    df[[type]] <- create_annotation_df(empty_char_list)
+    if (overwrite || !existing_types[[type]]) {
+      df[[type]] <- create_annotation_df(empty_char_list)
+    }
   }
 
-  df$annotation_snippet <- replicate(nrows, NA, simplify = FALSE)
+  if (overwrite || !existing_types$annotation_snippet) {
+    df$annotation_snippet <- replicate(nrows, NA, simplify = FALSE)
+  }
 
   # Initialize timing for ETA calculation
   start_time <- Sys.time()
@@ -1198,6 +1215,19 @@
     log_info(verbose, paste("Starting to fetch annotations for", nrows, "matches\n"))
   }
 
+  # Helper to decide if existing annotation row is effectively empty
+  is_empty_annotation_row <- function(ann_df, row_index) {
+    if (is.null(ann_df) || nrow(ann_df) < row_index) return(TRUE)
+    left_val <- ann_df$left[[row_index]]
+    match_val <- ann_df$match[[row_index]]
+    right_val <- ann_df$right[[row_index]]
+    all(
+      (is.null(left_val) || (length(left_val) == 0) || all(is.na(left_val))),
+      (is.null(match_val) || (length(match_val) == 0) || all(is.na(match_val))),
+      (is.null(right_val) || (length(right_val) == 0) || all(is.na(right_val)))
+    )
+  }
+
   for (i in seq_len(nrow(df))) {
     # ETA logging
     if (verbose && i > 1) {
@@ -1244,8 +1274,10 @@
       res <- apiCall(kco, req)
 
       if (!is.null(res)) {
-        # Store the raw annotation snippet
-        df$annotation_snippet[[i]] <- if (is.list(res) && "snippet" %in% names(res)) res$snippet else NA
+        # Store the raw annotation snippet (respect overwrite flag)
+        if (overwrite || !existing_types$annotation_snippet || is.null(df$annotation_snippet[[i]]) || is.na(df$annotation_snippet[[i]])) {
+          df$annotation_snippet[[i]] <- if (is.list(res) && "snippet" %in% names(res)) res$snippet else NA
+        }
 
         # Parse XML annotations if snippet is available
         if (is.list(res) && "snippet" %in% names(res)) {
@@ -1255,97 +1287,141 @@
           # Use individual assignment to avoid data frame mismatch errors
           tryCatch({
             # Assign POS annotations
-            df$pos$left[i] <- list(parsed_annotations$pos$left)
-            df$pos$match[i] <- list(parsed_annotations$pos$match)
-            df$pos$right[i] <- list(parsed_annotations$pos$right)
+            if (overwrite || !existing_types$pos || is_empty_annotation_row(df$pos, i)) {
+              df$pos$left[i] <- list(parsed_annotations$pos$left)
+              df$pos$match[i] <- list(parsed_annotations$pos$match)
+              df$pos$right[i] <- list(parsed_annotations$pos$right)
+            }
 
             # Assign lemma annotations
-            df$lemma$left[i] <- list(parsed_annotations$lemma$left)
-            df$lemma$match[i] <- list(parsed_annotations$lemma$match)
-            df$lemma$right[i] <- list(parsed_annotations$lemma$right)
+            if (overwrite || !existing_types$lemma || is_empty_annotation_row(df$lemma, i)) {
+              df$lemma$left[i] <- list(parsed_annotations$lemma$left)
+              df$lemma$match[i] <- list(parsed_annotations$lemma$match)
+              df$lemma$right[i] <- list(parsed_annotations$lemma$right)
+            }
 
             # Assign morphology annotations
-            df$morph$left[i] <- list(parsed_annotations$morph$left)
-            df$morph$match[i] <- list(parsed_annotations$morph$match)
-            df$morph$right[i] <- list(parsed_annotations$morph$right)
+            if (overwrite || !existing_types$morph || is_empty_annotation_row(df$morph, i)) {
+              df$morph$left[i] <- list(parsed_annotations$morph$left)
+              df$morph$match[i] <- list(parsed_annotations$morph$match)
+              df$morph$right[i] <- list(parsed_annotations$morph$right)
+            }
 
             # Assign token annotations
-            df$atokens$left[i] <- list(parsed_annotations$atokens$left)
-            df$atokens$match[i] <- list(parsed_annotations$atokens$match)
-            df$atokens$right[i] <- list(parsed_annotations$atokens$right)
+            if (overwrite || !existing_types$atokens || is_empty_annotation_row(df$atokens, i)) {
+              df$atokens$left[i] <- list(parsed_annotations$atokens$left)
+              df$atokens$match[i] <- list(parsed_annotations$atokens$match)
+              df$atokens$right[i] <- list(parsed_annotations$atokens$right)
+            }
           }, error = function(assign_error) {
             # Set empty character vectors on assignment error using list assignment
-            df$pos$left[i] <<- list(character(0))
-            df$pos$match[i] <<- list(character(0))
-            df$pos$right[i] <<- list(character(0))
+            if (overwrite || !existing_types$pos) {
+              df$pos$left[i] <<- list(character(0))
+              df$pos$match[i] <<- list(character(0))
+              df$pos$right[i] <<- list(character(0))
+            }
 
-            df$lemma$left[i] <<- list(character(0))
-            df$lemma$match[i] <<- list(character(0))
-            df$lemma$right[i] <<- list(character(0))
+            if (overwrite || !existing_types$lemma) {
+              df$lemma$left[i] <<- list(character(0))
+              df$lemma$match[i] <<- list(character(0))
+              df$lemma$right[i] <<- list(character(0))
+            }
 
-            df$morph$left[i] <<- list(character(0))
-            df$morph$match[i] <<- list(character(0))
-            df$morph$right[i] <<- list(character(0))
+            if (overwrite || !existing_types$morph) {
+              df$morph$left[i] <<- list(character(0))
+              df$morph$match[i] <<- list(character(0))
+              df$morph$right[i] <<- list(character(0))
+            }
 
-            df$atokens$left[i] <<- list(character(0))
-            df$atokens$match[i] <<- list(character(0))
-            df$atokens$right[i] <<- list(character(0))
+            if (overwrite || !existing_types$atokens) {
+              df$atokens$left[i] <<- list(character(0))
+              df$atokens$match[i] <<- list(character(0))
+              df$atokens$right[i] <<- list(character(0))
+            }
           })
         } else {
           # No snippet available, store empty vectors
-          df$pos$left[i] <- list(character(0))
-          df$pos$match[i] <- list(character(0))
-          df$pos$right[i] <- list(character(0))
+          if (overwrite || !existing_types$pos) {
+            df$pos$left[i] <- list(character(0))
+            df$pos$match[i] <- list(character(0))
+            df$pos$right[i] <- list(character(0))
+          }
 
-          df$lemma$left[i] <- list(character(0))
-          df$lemma$match[i] <- list(character(0))
-          df$lemma$right[i] <- list(character(0))
+          if (overwrite || !existing_types$lemma) {
+            df$lemma$left[i] <- list(character(0))
+            df$lemma$match[i] <- list(character(0))
+            df$lemma$right[i] <- list(character(0))
+          }
 
-          df$morph$left[i] <- list(character(0))
-          df$morph$match[i] <- list(character(0))
-          df$morph$right[i] <- list(character(0))
+          if (overwrite || !existing_types$morph) {
+            df$morph$left[i] <- list(character(0))
+            df$morph$match[i] <- list(character(0))
+            df$morph$right[i] <- list(character(0))
+          }
 
-          df$atokens$left[i] <- list(character(0))
-          df$atokens$match[i] <- list(character(0))
-          df$atokens$right[i] <- list(character(0))
+          if (overwrite || !existing_types$atokens) {
+            df$atokens$left[i] <- list(character(0))
+            df$atokens$match[i] <- list(character(0))
+            df$atokens$right[i] <- list(character(0))
+          }
         }
       } else {
         # Store NAs for failed requests
-        df$pos$left[i] <- list(NA)
-        df$pos$match[i] <- list(NA)
-        df$pos$right[i] <- list(NA)
+        if (overwrite || !existing_types$pos) {
+          df$pos$left[i] <- list(NA)
+          df$pos$match[i] <- list(NA)
+          df$pos$right[i] <- list(NA)
+        }
 
-        df$lemma$left[i] <- list(NA)
-        df$lemma$match[i] <- list(NA)
-        df$lemma$right[i] <- list(NA)
+        if (overwrite || !existing_types$lemma) {
+          df$lemma$left[i] <- list(NA)
+          df$lemma$match[i] <- list(NA)
+          df$lemma$right[i] <- list(NA)
+        }
 
-        df$morph$left[i] <- list(NA)
-        df$morph$match[i] <- list(NA)
-        df$morph$right[i] <- list(NA)
+        if (overwrite || !existing_types$morph) {
+          df$morph$left[i] <- list(NA)
+          df$morph$match[i] <- list(NA)
+          df$morph$right[i] <- list(NA)
+        }
 
-        df$atokens$left[i] <- list(NA)
-        df$atokens$match[i] <- list(NA)
-        df$atokens$right[i] <- list(NA)
-        df$annotation_snippet[[i]] <- NA
+        if (overwrite || !existing_types$atokens) {
+          df$atokens$left[i] <- list(NA)
+          df$atokens$match[i] <- list(NA)
+          df$atokens$right[i] <- list(NA)
+        }
+        if (overwrite || !existing_types$annotation_snippet) {
+          df$annotation_snippet[[i]] <- NA
+        }
       }
     }, error = function(e) {
       # Store NAs for failed requests
-      df$pos$left[i] <- list(NA)
-      df$pos$match[i] <- list(NA)
-      df$pos$right[i] <- list(NA)
+      if (overwrite || !existing_types$pos) {
+        df$pos$left[i] <- list(NA)
+        df$pos$match[i] <- list(NA)
+        df$pos$right[i] <- list(NA)
+      }
 
-      df$lemma$left[i] <- list(NA)
-      df$lemma$match[i] <- list(NA)
-      df$lemma$right[i] <- list(NA)
+      if (overwrite || !existing_types$lemma) {
+        df$lemma$left[i] <- list(NA)
+        df$lemma$match[i] <- list(NA)
+        df$lemma$right[i] <- list(NA)
+      }
 
-      df$morph$left[i] <- list(NA)
-      df$morph$match[i] <- list(NA)
-      df$morph$right[i] <- list(NA)
+      if (overwrite || !existing_types$morph) {
+        df$morph$left[i] <- list(NA)
+        df$morph$match[i] <- list(NA)
+        df$morph$right[i] <- list(NA)
+      }
 
-      df$atokens$left[i] <- list(NA)
-      df$atokens$match[i] <- list(NA)
-      df$atokens$right[i] <- list(NA)
-      df$annotation_snippet[[i]] <- NA
+      if (overwrite || !existing_types$atokens) {
+        df$atokens$left[i] <- list(NA)
+        df$atokens$match[i] <- list(NA)
+        df$atokens$right[i] <- list(NA)
+      }
+      if (overwrite || !existing_types$annotation_snippet) {
+        df$annotation_snippet[[i]] <- NA
+      }
     })
   }
 

diff --git a/Readme.md b/Readme.md
index 83c3120..fe4835f 100644
--- a/Readme.md
+++ b/Readme.md

@@ -197,6 +197,19 @@
 
 The annotations are stored in `q@collectedMatches$pos`, `q@collectedMatches$morph`, and `q@collectedMatches$lemma` (for foundries that contain lemma annotations, like `tt`, but not `marmot`).
 
+Small workflows you may find useful:
+
+```r
+# 1) Add TT (POS/lemma), then add MarMoT (morph) without overwriting
+q <- corpusQuery(kco, "Ameisenplage", metadataOnly = FALSE) |>
+  fetchAll() |>
+  fetchAnnotations(foundry = "tt") |>
+  fetchAnnotations(foundry = "marmot")  # keeps TT POS/lemma, adds morph
+
+# 2) Force re-fetch to repair damaged annotations
+q <- fetchAnnotations(q, foundry = "tt", overwrite = TRUE)
+```
+
 **Tip**: If you don't know any of the provided query languages or the tag sets, you can use KorAP's *query by example* (or rather *query by match*) feature by searching for a concrete example of the construction you are interested in, and then constructing your complex annotation query by just clicking on the entries in the tokens annotations of the query results, as demonstrated in this [video](https://corpora.ids-mannheim.de/slides/2024-04-24-Current-Challenges/autant-que-je-sache.mp4) (see also Diewald/Barbu Mititelu/Kupietz 2019).
 
 ## Demos

diff --git a/man/fetchAnnotations-KorAPQuery-method.Rd b/man/fetchAnnotations-KorAPQuery-method.Rd
index 5ae4a24..bf75599 100644
--- a/man/fetchAnnotations-KorAPQuery-method.Rd
+++ b/man/fetchAnnotations-KorAPQuery-method.Rd

@@ -5,13 +5,23 @@
 \alias{fetchAnnotations}
 \title{Fetch annotations for all collected matches}
 \usage{
-\S4method{fetchAnnotations}{KorAPQuery}(kqo, foundry = "tt", verbose = kqo@korapConnection@verbose)
+\S4method{fetchAnnotations}{KorAPQuery}(
+  kqo,
+  foundry = "tt",
+  overwrite = FALSE,
+  verbose = kqo@korapConnection@verbose
+)
 }
 \arguments{
 \item{kqo}{object obtained from \code{\link[=corpusQuery]{corpusQuery()}} with collected matches. Note: the original corpus query should have \code{metadataOnly = FALSE} for annotation parsing to work.}
 
 \item{foundry}{string specifying the foundry to use for annotations (default: "tt" for Tree-Tagger)}
 
+\item{overwrite}{logical; if TRUE, re-fetch and replace any existing
+annotation columns. If FALSE (default), keep existing annotations and only
+fill in layers or rows that are still missing, empty or NA (e.g., keep
+POS/lemma from a previous foundry while adding morph from another).}
+
 \item{verbose}{print progress information if true}
 }
 \value{

diff --git a/tests/testthat/test-fetchAnnotations.R b/tests/testthat/test-fetchAnnotations.R
index 3ff7c2c..c034295 100644
--- a/tests/testthat/test-fetchAnnotations.R
+++ b/tests/testthat/test-fetchAnnotations.R

@@ -337,3 +337,51 @@
     }
   }
 })
+
+test_that("fetchAnnotations adds missing layer without overwriting existing, and can overwrite when requested", {
+  # Define a separate dummy connection that serves different snippets by foundry
+  if (!isClass("DummyKCO2")) setClass('DummyKCO2', slots = c(apiUrl='character', verbose='logical'))
+  setMethod('apiCall', 'DummyKCO2', function(kco, url, json = TRUE, getHeaders = FALSE, cache = FALSE, timeout = 10) {
+    # Return TT-only snippet by default, and TT+MarMoT morph when foundry=marmot
+    tt_xml <- '<span class="context-left"></span>
+  <span class="match">
+    <mark><span title="tt/l:können tt/p:VVFIN">können</span></mark>&nbsp;<span title="tt/l:alles tt/p:PIS">alles</span><mark><span title="tt/l:außer tt/p:APPR">außer</span></mark>
+  </span>
+  <span class="context-right"></span>'
+    marmot_xml <- '<span class="context-left"></span>
+  <span class="match">
+    <mark><span title="tt/l:können tt/p:VVFIN marmot/m:verbform:fin">können</span></mark>&nbsp;<span title="tt/l:alles tt/p:PIS marmot/m:pos:pron">alles</span><mark><span title="tt/l:außer tt/p:APPR marmot/m:pos:adp|case:acc">außer</span></mark>
+  </span>
+  <span class="context-right"></span>'
+    if (grepl("foundry=marmot", url)) list(snippet = marmot_xml) else list(snippet = tt_xml)
+  })
+
+  # Build query with one match row
+  kco <- new('DummyKCO2', apiUrl = 'http://dummy/', verbose = FALSE)
+  df <- data.frame(textSigle = 'X/Y/Z', matchStart = 1, matchEnd = 3, matchID = 'match-X/Y/Z-p1-3', stringsAsFactors = FALSE)
+  q <- KorAPQuery(korapConnection = kco, collectedMatches = df)
+
+  # First call with TT: should populate pos/lemma, morph empty/NA
+  q1 <- fetchAnnotations(q, foundry = 'tt', verbose = FALSE)
+  pos_tt <- q1@collectedMatches$pos$match[[1]]
+  lem_tt <- q1@collectedMatches$lemma$match[[1]]
+
+  expect_equal(pos_tt, c('VVFIN','PIS','APPR'))
+  expect_equal(lem_tt, c('können','alles','außer'))
+  # Morph should be empty or NA-only at this point
+  morph1 <- q1@collectedMatches$morph$match[[1]]
+  expect_true(length(morph1) == 0 || all(is.na(morph1)))
+
+  # Second call with marmot: should add morph but keep pos/lemma unchanged when overwrite=FALSE
+  q2 <- fetchAnnotations(q1, foundry = 'marmot', verbose = FALSE)
+  expect_equal(q2@collectedMatches$pos$match[[1]], pos_tt)
+  expect_equal(q2@collectedMatches$lemma$match[[1]], lem_tt)
+
+  morph2 <- q2@collectedMatches$morph$match[[1]]
+  expect_equal(morph2, c('verbform:fin','pos:pron','pos:adp|case:acc'))
+
+  # Corrupt existing POS and ensure overwrite=TRUE repairs it
+  q2@collectedMatches$pos$match[[1]][1] <- 'DAMAGED'
+  q3 <- fetchAnnotations(q2, foundry = 'tt', overwrite = TRUE, verbose = FALSE)
+  expect_equal(q3@collectedMatches$pos$match[[1]][1], 'VVFIN')
+})
commit	93787d52e349fae2d6b1a9e2a2ca07bf1ad5dd60	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Wed Sep 03 13:33:25 2025 +0200
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Wed Sep 03 13:33:25 2025 +0200
tree	d42eabe0fd2b40c13e21cc31cebae12c301315d4
parent	560b591ea7e4acefff9892d2a1f10cda0d139340 [diff]