Blame - R/KorAPQuery.R - KorAP/RKorAPClient

blob: a8372e34c3ec7ab6c69f14a6cd4afe6b3358b7fa [file] [log] [blame]

Marc Kupietz	a8c40f4	2025-06-24 15:49:52 +0200	[diff] [blame]	1	#' KorAPQuery class (internal)
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	2	#'
Marc Kupietz	a8c40f4	2025-06-24 15:49:52 +0200	[diff] [blame]	3	#' Internal class for query state management. Users work with `corpusQuery()`, `fetchAll()`, and `fetchNext()` instead.
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	4	#'
Marc Kupietz	a8c40f4	2025-06-24 15:49:52 +0200	[diff] [blame]	5	#' @keywords internal
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	6	#' @include KorAPConnection.R
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	7	#' @include logging.R
Marc Kupietz	f912959	2025-01-26 19:17:54 +0100	[diff] [blame]	8	#' @import httr2
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	9	#'
Marc Kupietz	a6e4ee6	2021-03-05 09:00:15 +0100	[diff] [blame]	10	#' @include RKorAPClient-package.R
Marc Kupietz	5bbc9db	2019-08-30 16:30:45 +0200	[diff] [blame]	11
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	12	#' @export
				13	KorAPQuery <- setClass("KorAPQuery", slots = c(
Marc Kupietz	b897218	2019-09-20 21:33:46 +0200	[diff] [blame]	14	"korapConnection",
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	15	"request",
				16	"vc",
				17	"totalResults",
				18	"nextStartIndex",
				19	"fields",
				20	"requestUrl",
				21	"webUIRequestUrl",
				22	"apiResponse",
				23	"collectedMatches",
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	24	"hasMoreMatches"
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	25	))
Marc Kupietz	5bbc9db	2019-08-30 16:30:45 +0200	[diff] [blame]	26
Marc Kupietz	a8c40f4	2025-06-24 15:49:52 +0200	[diff] [blame]	27	#' Initialize KorAPQuery object
				28	#' @keywords internal
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	29	#' @param .Object …
Marc Kupietz	b897218	2019-09-20 21:33:46 +0200	[diff] [blame]	30	#' @param korapConnection KorAPConnection object
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	31	#' @param request query part of the request URL
				32	#' @param vc definition of a virtual corpus
				33	#' @param totalResults number of hits the query has yielded
				34	#' @param nextStartIndex at what index to start the next fetch of query results
				35	#' @param fields what data / metadata fields should be collected
				36	#' @param requestUrl complete URL of the API request
				37	#' @param webUIRequestUrl URL of a web frontend request corresponding to the API request
				38	#' @param apiResponse data-frame representation of the JSON response of the API request
Marc Kupietz	7776dec	2019-09-27 16:59:02 +0200	[diff] [blame]	39	#' @param hasMoreMatches logical that signals if more query results can be fetched
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	40	#' @param collectedMatches matches already fetched from the KorAP-API-server
Marc Kupietz	97a1bca	2019-10-04 22:52:09 +0200	[diff] [blame]	41	#'
				42	#' @importFrom tibble tibble
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	43	#' @export
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	44	setMethod(
				45	"initialize", "KorAPQuery",
				46	function(.Object, korapConnection = NULL, request = NULL, vc = "", totalResults = 0, nextStartIndex = 0, fields = c(
				47	"corpusSigle", "textSigle", "pubDate", "pubPlace",
				48	"availability", "textClass", "snippet", "tokens"
				49	),
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	50	requestUrl = "", webUIRequestUrl = "", apiResponse = NULL, hasMoreMatches = FALSE, collectedMatches = NULL) {
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	51	.Object <- callNextMethod()
				52	.Object@korapConnection <- korapConnection
				53	.Object@request <- request
				54	.Object@vc <- vc
				55	.Object@totalResults <- totalResults
				56	.Object@nextStartIndex <- nextStartIndex
				57	.Object@fields <- fields
				58	.Object@requestUrl <- requestUrl
				59	.Object@webUIRequestUrl <- webUIRequestUrl
				60	.Object@apiResponse <- apiResponse
				61	.Object@hasMoreMatches <- hasMoreMatches
				62	.Object@collectedMatches <- collectedMatches
				63	.Object
				64	}
				65	)
Marc Kupietz	632cbd4	2019-09-06 16:04:51 +0200	[diff] [blame]	66
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	67	setGeneric("corpusQuery", function(kco, ...) standardGeneric("corpusQuery"))
				68	setGeneric("fetchAll", function(kqo, ...) standardGeneric("fetchAll"))
				69	setGeneric("fetchNext", function(kqo, ...) standardGeneric("fetchNext"))
				70	setGeneric("fetchRest", function(kqo, ...) standardGeneric("fetchRest"))
Marc Kupietz	0af7593	2025-09-09 18:14:16 +0200	[diff] [blame]	71	setGeneric(
				72	"fetchAnnotations",
				73	function(kqo,
				74	foundry = "tt",
				75	overwrite = FALSE,
				76	verbose = kqo@korapConnection@verbose) standardGeneric("fetchAnnotations")
				77	)
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	78	setGeneric("frequencyQuery", function(kco, ...) standardGeneric("frequencyQuery"))
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	79
				80	maxResultsPerPage <- 50
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	81
Marc Kupietz	4de53ec	2019-10-04 09:12:00 +0200	[diff] [blame]	82	## quiets concerns of R CMD check re: the .'s that appear in pipelines
Marc Kupietz	ef1ef4a	2025-02-19 12:12:40 +0100	[diff] [blame]	83	utils::globalVariables(c("."))
Marc Kupietz	632cbd4	2019-09-06 16:04:51 +0200	[diff] [blame]	84
Marc Kupietz	a8c40f4	2025-06-24 15:49:52 +0200	[diff] [blame]	85	#' Search corpus for query terms
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	86	#'
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	87	#' `corpusQuery` performs a corpus query via a connection to a KorAP-API-server
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	88	#'
Marc Kupietz	a8c40f4	2025-06-24 15:49:52 +0200	[diff] [blame]	89	#' @family corpus search functions
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	90	#' @aliases corpusQuery
				91	#'
				92	#' @importFrom urltools url_encode
				93	#' @importFrom purrr pmap
Marc Kupietz	ea34b81	2025-06-25 15:49:00 +0200	[diff] [blame]	94	#' @importFrom dplyr bind_rows group_by
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	95	#'
Marc Kupietz	617266d	2025-02-27 10:43:07 +0100	[diff] [blame]	96	#' @param kco [KorAPConnection()] object (obtained e.g. from `KorAPConnection()`
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	97	#' @param query string that contains the corpus query. The query language depends on the `ql` parameter. Either `query` must be provided or `KorAPUrl`.
Marc Kupietz	632cbd4	2019-09-06 16:04:51 +0200	[diff] [blame]	98	#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	99	#' @param KorAPUrl instead of providing the query and vc string parameters, you can also simply copy a KorAP query URL from your browser and use it here (and in `KorAPConnection`) to provide all necessary information for the query.
Marc Kupietz	132f005	2023-04-16 14:23:05 +0200	[diff] [blame]	100	#' @param metadataOnly logical that determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. Note that the default value is TRUE.
				101	#' If you want your corpus queries to return not only metadata, but also KWICS, you need to authorize
				102	#' your RKorAPClient application as explained in the
				103	#' [authorization section](https://github.com/KorAP/RKorAPClient#authorization)
				104	#' of the RKorAPClient Readme on GitHub and set the `metadataOnly` parameter to
				105	#' `FALSE`.
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	106	#' @param ql string to choose the query language (see [section on Query Parameters](https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET#user-content-parameters) in the Kustvakt-Wiki for possible values.
Marc Kupietz	1623fe8	2025-06-24 16:31:46 +0200	[diff] [blame]	107	#' @param fields character vector specifying which metadata fields to retrieve for each match.
				108	#' Available fields depend on the corpus. For DeReKo (German Reference Corpus), possible fields include:
				109	#' \describe{
				110	#' \item{Text identification:}{`textSigle`, `docSigle`, `corpusSigle` - hierarchical text identifiers}
				111	#' \item{Publication info:}{`author`, `editor`, `title`, `docTitle`, `corpusTitle` - authorship and titles}
				112	#' \item{Temporal data:}{`pubDate`, `creationDate` - when text was published/created}
				113	#' \item{Publication details:}{`pubPlace`, `publisher`, `reference` - where/how published}
				114	#' \item{Text classification:}{`textClass`, `textType`, `textTypeArt`, `textDomain`, `textColumn` - topic domain, genre, text type and column}
				115	#' \item{Adminstrative and technical info:}{`corpusEditor`, `availability`, `language`, `foundries` - access rights and annotations}
				116	#' \item{Content data:}{`snippet`, `tokens`, `tokenSource`, `externalLink` - actual text content, tokenization, and link to source text}
				117	#' \item{System data:}{`indexCreationDate`, `indexLastModified` - corpus indexing info}
				118	#' }
				119	#' Use `c("textSigle", "pubDate", "author")` to retrieve multiple fields.
				120	#' Default fields provide basic text identification and publication metadata. The actual text content (`snippet` and `tokens`) are activated by default if `metadataOnly` is set to `FALSE`.
Marc Kupietz	43a6ade	2020-02-18 17:01:44 +0100	[diff] [blame]	121	#' @param accessRewriteFatal abort if query or given vc had to be rewritten due to insufficient rights (not yet implemented).
Marc Kupietz	25aebc3	2019-09-16 18:40:50 +0200	[diff] [blame]	122	#' @param verbose print some info
Marc Kupietz	4de53ec	2019-10-04 09:12:00 +0200	[diff] [blame]	123	#' @param as.df return result as data frame instead of as S4 object?
Marc Kupietz	ad8d2ed	2025-04-05 15:37:38 +0200	[diff] [blame]	124	#' @param expand logical that decides if `query` and `vc` parameters are expanded to all of their combinations. Defaults to `TRUE`, iff `query` and `vc` have different lengths
Marc Kupietz	d9b2fd7	2023-04-17 19:08:50 +0200	[diff] [blame]	125	#' @param context string that specifies the size of the left and the right context returned in `snippet`
				126	#' (provided that `metadataOnly` is set to `false` and that the necessary access right are met).
				127	#' The format of the context size specifcation (e.g. `3-token,3-token`) is described in the [Service: Search GET documentation of the Kustvakt Wiki](https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET).
				128	#' If the parameter is not set, the default context size secification of the KorAP server instance will be used.
				129	#' Note that you cannot overrule the maximum context size set in the KorAP server instance,
				130	#' as this is typically legally motivated.
Marc Kupietz	ad8d2ed	2025-04-05 15:37:38 +0200	[diff] [blame]	131	#' @return Depending on the `as.df` parameter, a tibble or a [KorAPQuery()] object that, among other information, contains the total number of results in `@totalResults`. The resulting object can be used to fetch all query results (with [fetchAll()]) or the next page of results (with [fetchNext()]).
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	132	#' A corresponding URL to be used within a web browser is contained in `@webUIRequestUrl`
				133	#' Please make sure to check `$collection$rewrites` to see if any unforeseen access rewrites of the query's virtual corpus had to be performed.
Marc Kupietz	632cbd4	2019-09-06 16:04:51 +0200	[diff] [blame]	134	#'
				135	#' @examples
Marc Kupietz	6ae7605	2021-09-21 10:34:00 +0200	[diff] [blame]	136	#' \dontrun{
				137	#'
Marc Kupietz	1623fe8	2025-06-24 16:31:46 +0200	[diff] [blame]	138	#' # Fetch basic metadata for "Ameisenplage"
Marc Kupietz	d352642	2025-06-25 09:16:15 +0200	[diff] [blame]	139	#' KorAPConnection() \|>
				140	#' corpusQuery("Ameisenplage") \|>
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	141	#' fetchAll()
Marc Kupietz	1623fe8	2025-06-24 16:31:46 +0200	[diff] [blame]	142	#'
				143	#' # Fetch specific metadata fields for bibliographic analysis
Marc Kupietz	d352642	2025-06-25 09:16:15 +0200	[diff] [blame]	144	#' query <- KorAPConnection() \|>
Marc Kupietz	1623fe8	2025-06-24 16:31:46 +0200	[diff] [blame]	145	#' corpusQuery("Ameisenplage",
				146	#' fields = c("textSigle", "author", "title", "pubDate", "pubPlace", "textType"))
				147	#' results <- fetchAll(query)
				148	#' results@collectedMatches
Marc Kupietz	657d8e7	2020-02-25 18:31:50 +0100	[diff] [blame]	149	#' }
Marc Kupietz	3c531f6	2019-09-13 12:17:24 +0200	[diff] [blame]	150	#'
Marc Kupietz	6ae7605	2021-09-21 10:34:00 +0200	[diff] [blame]	151	#' \dontrun{
				152	#'
Marc Kupietz	603491f	2019-09-18 14:01:02 +0200	[diff] [blame]	153	#' # Use the copy of a KorAP-web-frontend URL for an API query of "Ameise" in a virtual corpus
				154	#' # and show the number of query hits (but don't fetch them).
Marc Kupietz	69cc54a	2019-09-30 12:06:54 +0200	[diff] [blame]	155	#'
Marc Kupietz	d352642	2025-06-25 09:16:15 +0200	[diff] [blame]	156	#' KorAPConnection(verbose = TRUE) \|>
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	157	#' corpusQuery(
				158	#' KorAPUrl =
				159	#' "https://korap.ids-mannheim.de/?q=Ameise&cq=pubDate+since+2017&ql=poliqarp"
				160	#' )
Marc Kupietz	6ae7605	2021-09-21 10:34:00 +0200	[diff] [blame]	161	#' }
				162	#'
				163	#' \dontrun{
Marc Kupietz	3c531f6	2019-09-13 12:17:24 +0200	[diff] [blame]	164	#'
Marc Kupietz	603491f	2019-09-18 14:01:02 +0200	[diff] [blame]	165	#' # Plot the time/frequency curve of "Ameisenplage"
Marc Kupietz	d352642	2025-06-25 09:16:15 +0200	[diff] [blame]	166	#' KorAPConnection(verbose = TRUE) \|>
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	167	#' {
				168	#' . ->> kco
Marc Kupietz	d352642	2025-06-25 09:16:15 +0200	[diff] [blame]	169	#' } \|>
				170	#' corpusQuery("Ameisenplage") \|>
				171	#' fetchAll() \|>
				172	#' slot("collectedMatches") \|>
				173	#' mutate(year = lubridate::year(pubDate)) \|>
				174	#' dplyr::select(year) \|>
				175	#' group_by(year) \|>
				176	#' summarise(Count = dplyr::n()) \|>
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	177	#' mutate(Freq = mapply(function(f, y) {
				178	#' f / corpusStats(kco, paste("pubDate in", y))@tokens
Marc Kupietz	d352642	2025-06-25 09:16:15 +0200	[diff] [blame]	179	#' }, Count, year)) \|>
				180	#' dplyr::select(-Count) \|>
				181	#' complete(year = min(year):max(year), fill = list(Freq = 0)) \|>
Marc Kupietz	69cc54a	2019-09-30 12:06:54 +0200	[diff] [blame]	182	#' plot(type = "l")
Marc Kupietz	05b2277	2020-02-18 21:58:42 +0100	[diff] [blame]	183	#' }
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	184	#' @seealso [KorAPConnection()], [fetchNext()], [fetchRest()], [fetchAll()], [corpusStats()]
Marc Kupietz	632cbd4	2019-09-06 16:04:51 +0200	[diff] [blame]	185	#'
				186	#' @references
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	187	#' <https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026>
Marc Kupietz	632cbd4	2019-09-06 16:04:51 +0200	[diff] [blame]	188	#'
				189	#' @export
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	190	setMethod(
				191	"corpusQuery", "KorAPConnection",
				192	function(kco,
				193	query = if (missing(KorAPUrl)) {
				194	stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
				195	} else {
				196	httr2::url_parse(KorAPUrl)$query$q
				197	},
				198	vc = if (missing(KorAPUrl)) "" else httr2::url_parse(KorAPUrl)$query$cq,
				199	KorAPUrl,
				200	metadataOnly = TRUE,
				201	ql = if (missing(KorAPUrl)) "poliqarp" else httr2::url_parse(KorAPUrl)$query$ql,
				202	fields = c(
				203	"corpusSigle",
				204	"textSigle",
				205	"pubDate",
				206	"pubPlace",
				207	"availability",
				208	"textClass",
				209	"snippet",
				210	"tokens"
				211	),
				212	accessRewriteFatal = TRUE,
				213	verbose = kco@verbose,
				214	expand = length(vc) != length(query),
				215	as.df = FALSE,
				216	context = NULL) {
				217	if (length(query) > 1 \|\| length(vc) > 1) {
				218	grid <- if (expand) expand_grid(query = query, vc = vc) else tibble(query = query, vc = vc)
Marc Kupietz	6ef61a8	2025-05-29 16:07:03 +0200	[diff] [blame]	219
				220	# Initialize timing variables for ETA calculation
				221	total_queries <- nrow(grid)
				222	current_query <- 0
				223	start_time <- Sys.time()
				224
Marc Kupietz	6ef61a8	2025-05-29 16:07:03 +0200	[diff] [blame]	225	results <- purrr::pmap(grid, function(query, vc, ...) {
				226	current_query <<- current_query + 1
				227
				228	# Execute the single query directly (avoiding recursive call)
				229	contentFields <- c("snippet", "tokens")
				230	query_fields <- fields
				231	if (metadataOnly) {
				232	query_fields <- query_fields[!query_fields %in% contentFields]
				233	}
				234	if (!"textSigle" %in% query_fields) {
				235	query_fields <- c(query_fields, "textSigle")
				236	}
				237	request <-
				238	paste0(
				239	"?q=",
				240	url_encode(enc2utf8(query)),
				241	ifelse(!metadataOnly && !is.null(context) && context != "", paste0("&context=", url_encode(enc2utf8(context))), ""),
				242	ifelse(vc != "", paste0("&cq=", url_encode(enc2utf8(vc))), ""),
				243	ifelse(!metadataOnly, "&show-tokens=true", ""),
				244	"&ql=", ql
				245	)
				246	webUIRequestUrl <- paste0(kco@KorAPUrl, request)
				247	requestUrl <- paste0(
				248	kco@apiUrl,
				249	"search",
				250	request,
				251	"&fields=",
				252	paste(query_fields, collapse = ","),
				253	if (metadataOnly) "&access-rewrite-disabled=true" else ""
				254	)
				255
				256	# Show individual query progress
				257	log_info(verbose, "\rSearching \"", query, "\" in \"", vc, "\"", sep = "")
				258	res <- apiCall(kco, paste0(requestUrl, "&count=0"))
				259	if (is.null(res)) {
				260	log_info(verbose, ": API call failed\n")
				261	totalResults <- 0
				262	} else {
				263	totalResults <- as.integer(res$meta$totalResults)
				264	log_info(verbose, ": ", totalResults, " hits")
				265	if (!is.null(res$meta$cached)) {
				266	log_info(verbose, " [cached]")
				267	} else if (!is.null(res$meta$benchmark)) {
				268	if (is.character(res$meta$benchmark) && grepl("s$", res$meta$benchmark)) {
				269	time_value <- as.numeric(sub("s$", "", res$meta$benchmark))
				270	formatted_time <- paste0(round(time_value, 2), "s")
				271	log_info(verbose, ", took ", formatted_time)
				272	} else {
				273	log_info(verbose, ", took ", res$meta$benchmark)
				274	}
				275	}
Marc Kupietz	365660e	2025-06-25 15:09:55 +0200	[diff] [blame]	276
				277	# Calculate and display ETA information on the same line if verbose and we have more than one query
				278	if (verbose && total_queries > 1) {
				279	eta_info <- calculate_eta(current_query, total_queries, start_time)
				280	if (eta_info != "") {
				281	elapsed_time <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
				282	avg_time_per_query <- elapsed_time / current_query
				283
				284	# Add ETA info to the same line - remove the leading ". " for cleaner formatting
				285	clean_eta_info <- sub("^\\. ", ". ", eta_info)
				286	log_info(verbose, clean_eta_info)
				287	}
				288	}
				289
Marc Kupietz	6ef61a8	2025-05-29 16:07:03 +0200	[diff] [blame]	290	log_info(verbose, "\n")
				291	}
				292
				293	result <- data.frame(
				294	query = query,
				295	totalResults = totalResults,
				296	vc = vc,
				297	webUIRequestUrl = webUIRequestUrl,
				298	stringsAsFactors = FALSE
				299	)
				300
Marc Kupietz	6ef61a8	2025-05-29 16:07:03 +0200	[diff] [blame]	301	return(result)
				302	})
				303
				304	results %>% bind_rows()
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	305	} else {
Marc Kupietz	2078bde	2023-08-27 16:46:15 +0200	[diff] [blame]	306	contentFields <- c("snippet", "tokens")
Marc Kupietz	a96537f	2019-11-09 23:07:44 +0100	[diff] [blame]	307	if (metadataOnly) {
				308	fields <- fields[!fields %in% contentFields]
				309	}
Marc Kupietz	80dc643	2025-02-07 16:57:40 +0100	[diff] [blame]	310	if (!"textSigle" %in% fields) {
				311	fields <- c(fields, "textSigle")
				312	}
Marc Kupietz	a96537f	2019-11-09 23:07:44 +0100	[diff] [blame]	313	request <-
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	314	paste0(
				315	"?q=",
				316	url_encode(enc2utf8(query)),
				317	ifelse(!metadataOnly && !is.null(context) && context != "", paste0("&context=", url_encode(enc2utf8(context))), ""),
				318	ifelse(vc != "", paste0("&cq=", url_encode(enc2utf8(vc))), ""),
				319	ifelse(!metadataOnly, "&show-tokens=true", ""),
				320	"&ql=", ql
				321	)
Marc Kupietz	a96537f	2019-11-09 23:07:44 +0100	[diff] [blame]	322	webUIRequestUrl <- paste0(kco@KorAPUrl, request)
				323	requestUrl <- paste0(
				324	kco@apiUrl,
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	325	"search",
Marc Kupietz	a96537f	2019-11-09 23:07:44 +0100	[diff] [blame]	326	request,
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	327	"&fields=",
Marc Kupietz	a96537f	2019-11-09 23:07:44 +0100	[diff] [blame]	328	paste(fields, collapse = ","),
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	329	if (metadataOnly) "&access-rewrite-disabled=true" else ""
Marc Kupietz	a96537f	2019-11-09 23:07:44 +0100	[diff] [blame]	330	)
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	331	log_info(verbose, "\rSearching \"", query, "\" in \"", vc, "\"",
				332	sep =
				333	""
				334	)
				335	res <- apiCall(kco, paste0(requestUrl, "&count=0"))
Marc Kupietz	a467572	2022-02-23 23:55:15 +0100	[diff] [blame]	336	if (is.null(res)) {
Marc Kupietz	a467572	2022-02-23 23:55:15 +0100	[diff] [blame]	337	message("API call failed.")
				338	totalResults <- 0
				339	} else {
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	340	totalResults <- as.integer(res$meta$totalResults)
Marc Kupietz	a47d150	2023-04-18 15:26:47 +0200	[diff] [blame]	341	log_info(verbose, ": ", totalResults, " hits")
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	342	if (!is.null(res$meta$cached)) {
Marc Kupietz	a47d150	2023-04-18 15:26:47 +0200	[diff] [blame]	343	log_info(verbose, " [cached]\n")
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	344	} else if (!is.null(res$meta$benchmark)) {
Marc Kupietz	2baf5c5	2025-09-05 16:41:11 +0200	[diff] [blame]	345	# Round the benchmark time to 2 decimal places for better readability.
				346	# Be robust to locales using comma as decimal separator (e.g., "0,12s").
Marc Kupietz	7638ca4	2025-05-25 13:18:16 +0200	[diff] [blame]	347	if (is.character(res$meta$benchmark) && grepl("s$", res$meta$benchmark)) {
Marc Kupietz	2baf5c5	2025-09-05 16:41:11 +0200	[diff] [blame]	348	bench_str <- sub("s$", "", res$meta$benchmark)
				349	bench_num <- suppressWarnings(as.numeric(gsub(",", ".", bench_str)))
				350	if (!is.na(bench_num)) {
				351	formatted_time <- paste0(round(bench_num, 2), "s")
				352	} else {
				353	formatted_time <- res$meta$benchmark
				354	}
Marc Kupietz	7638ca4	2025-05-25 13:18:16 +0200	[diff] [blame]	355	log_info(verbose, ", took ", formatted_time, "\n", sep = "")
				356	} else {
				357	# Fallback if the format is different than expected
				358	log_info(verbose, ", took ", res$meta$benchmark, "\n", sep = "")
				359	}
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	360	} else {
				361	log_info(verbose, "\n")
				362	}
Marc Kupietz	a467572	2022-02-23 23:55:15 +0100	[diff] [blame]	363	}
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	364	if (as.df) {
Marc Kupietz	a96537f	2019-11-09 23:07:44 +0100	[diff] [blame]	365	data.frame(
				366	query = query,
Marc Kupietz	a467572	2022-02-23 23:55:15 +0100	[diff] [blame]	367	totalResults = totalResults,
Marc Kupietz	a96537f	2019-11-09 23:07:44 +0100	[diff] [blame]	368	vc = vc,
				369	webUIRequestUrl = webUIRequestUrl,
				370	stringsAsFactors = FALSE
				371	)
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	372	} else {
Marc Kupietz	a96537f	2019-11-09 23:07:44 +0100	[diff] [blame]	373	KorAPQuery(
				374	korapConnection = kco,
				375	nextStartIndex = 0,
				376	fields = fields,
				377	requestUrl = requestUrl,
				378	request = request,
Marc Kupietz	a467572	2022-02-23 23:55:15 +0100	[diff] [blame]	379	totalResults = totalResults,
Marc Kupietz	a96537f	2019-11-09 23:07:44 +0100	[diff] [blame]	380	vc = vc,
				381	apiResponse = res,
				382	webUIRequestUrl = webUIRequestUrl,
Marc Kupietz	a467572	2022-02-23 23:55:15 +0100	[diff] [blame]	383	hasMoreMatches = (totalResults > 0),
Marc Kupietz	a96537f	2019-11-09 23:07:44 +0100	[diff] [blame]	384	)
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	385	}
Marc Kupietz	a96537f	2019-11-09 23:07:44 +0100	[diff] [blame]	386	}
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	387	}
				388	)
Marc Kupietz	5bbc9db	2019-08-30 16:30:45 +0200	[diff] [blame]	389
Marc Kupietz	05a6079	2024-12-07 16:23:31 +0100	[diff] [blame]	390	#' @importFrom purrr map
				391	repair_data_strcuture <- function(x) {
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	392	if (is.list(x)) {
				393	as.character(purrr::map(x, ~ if (length(.x) > 1) {
Marc Kupietz	05a6079	2024-12-07 16:23:31 +0100	[diff] [blame]	394	paste(.x, collapse = " ")
				395	} else {
				396	.x
				397	}))
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	398	} else {
Marc Kupietz	05a6079	2024-12-07 16:23:31 +0100	[diff] [blame]	399	ifelse(is.na(x), "", x)
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	400	}
Marc Kupietz	05a6079	2024-12-07 16:23:31 +0100	[diff] [blame]	401	}
				402
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	403	#' Fetch the next bunch of results of a KorAP query.
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	404	#'
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	405	#' `fetchNext` fetches the next bunch of results of a KorAP query.
Marc Kupietz	3f57528	2019-10-04 14:46:04 +0200	[diff] [blame]	406	#'
Marc Kupietz	a8c40f4	2025-06-24 15:49:52 +0200	[diff] [blame]	407	#' @family corpus search functions
				408	#'
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	409	#' @param kqo object obtained from [corpusQuery()]
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	410	#' @param offset start offset for query results to fetch
				411	#' @param maxFetch maximum number of query results to fetch
Marc Kupietz	25aebc3	2019-09-16 18:40:50 +0200	[diff] [blame]	412	#' @param verbose print progress information if true
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	413	#' @param randomizePageOrder fetch result pages in pseudo random order if true. Use [set.seed()] to set seed for reproducible results.
				414	#' @return The `kqo` input object with updated slots `collectedMatches`, `apiResponse`, `nextStartIndex`, `hasMoreMatches`
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	415	#'
Marc Kupietz	05b2277	2020-02-18 21:58:42 +0100	[diff] [blame]	416	#' @examples
Marc Kupietz	6ae7605	2021-09-21 10:34:00 +0200	[diff] [blame]	417	#' \dontrun{
				418	#'
Marc Kupietz	d352642	2025-06-25 09:16:15 +0200	[diff] [blame]	419	#' q <- KorAPConnection() \|>
				420	#' corpusQuery("Ameisenplage") \|>
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	421	#' fetchNext()
Marc Kupietz	05b2277	2020-02-18 21:58:42 +0100	[diff] [blame]	422	#' q@collectedMatches
Marc Kupietz	657d8e7	2020-02-25 18:31:50 +0100	[diff] [blame]	423	#' }
Marc Kupietz	05b2277	2020-02-18 21:58:42 +0100	[diff] [blame]	424	#'
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	425	#' @references
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	426	#' <https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026>
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	427	#'
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	428	#' @aliases fetchNext
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	429	#' @importFrom dplyr rowwise mutate bind_rows select summarise n select
Marc Kupietz	f488112	2024-12-17 14:55:39 +0100	[diff] [blame]	430	#' @importFrom tibble enframe add_column
				431	#' @importFrom stringr word
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	432	#' @importFrom tidyr unnest unchop pivot_wider
				433	#' @importFrom purrr map
Marc Kupietz	632cbd4	2019-09-06 16:04:51 +0200	[diff] [blame]	434	#' @export
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	435	setMethod("fetchNext", "KorAPQuery", function(kqo,
				436	offset = kqo@nextStartIndex,
				437	maxFetch = maxResultsPerPage,
				438	verbose = kqo@korapConnection@verbose,
				439	randomizePageOrder = FALSE) {
Marc Kupietz	a7a8f1b	2024-12-18 15:56:19 +0100	[diff] [blame]	440	# https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	441	results <- key <- name <- tmp_positions <- 0
Marc Kupietz	a7a8f1b	2024-12-18 15:56:19 +0100	[diff] [blame]	442
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	443	if (kqo@totalResults == 0 \|\| offset >= kqo@totalResults) {
				444	return(kqo)
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	445	}
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	446	use_korap_api <- Sys.getenv("USE_KORAP_API", unset = NA)
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	447	# Calculate the initial page number (not used directly - keeping for reference)
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	448	collectedMatches <- kqo@collectedMatches
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	449
Marc Kupietz	24799fd	2025-06-25 14:15:36 +0200	[diff] [blame]	450	# Track start time for ETA calculation
				451	start_time <- Sys.time()
				452
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	453	# For randomized page order, generate a list of randomized page indices
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	454	if (randomizePageOrder) {
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	455	# Calculate how many pages we need to fetch based on maxFetch
				456	total_pages_to_fetch <- if (!is.na(maxFetch)) {
				457	# Either limited by maxFetch or total results, whichever is smaller
				458	min(ceiling(maxFetch / maxResultsPerPage), ceiling(kqo@totalResults / maxResultsPerPage))
				459	} else {
				460	# All pages
				461	ceiling(kqo@totalResults / maxResultsPerPage)
				462	}
				463
				464	# Generate randomized page indices (0-based for API)
				465	pages <- sample.int(ceiling(kqo@totalResults / maxResultsPerPage), total_pages_to_fetch) - 1
				466	page_index <- 1 # Index to track which page in the randomized list we're on
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	467	}
				468
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	469	if (is.null(collectedMatches)) {
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	470	collectedMatches <- data.frame()
				471	}
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	472
				473	# Initialize the page counter properly based on nextStartIndex and any previously fetched results
				474	# We add 1 to make it 1-based for display purposes since users expect page numbers to start from 1
				475	# For first call, this will be 1, for subsequent calls, it will reflect our actual position
				476	current_page_number <- ceiling(offset / maxResultsPerPage) + 1
				477
				478	# For sequential fetches, keep track of which global page we're on
				479	# This is important for correctly showing page numbers in subsequent fetchNext calls
				480	page_count_start <- current_page_number
				481
Marc Kupietz	5bbc9db	2019-08-30 16:30:45 +0200	[diff] [blame]	482	repeat {
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	483	# Determine which page to fetch next
				484	if (randomizePageOrder) {
				485	# In randomized mode, get the page from our randomized list using the page_index
				486	# Make sure we don't exceed the array bounds
				487	if (page_index > length(pages)) {
				488	break # No more pages to fetch in randomized mode
				489	}
				490	current_offset_page <- pages[page_index]
				491	# For display purposes in randomized mode, show which page out of the total we're fetching
				492	display_page_number <- page_index
				493	} else {
				494	# In sequential mode, use the current_page_number to calculate the offset
				495	current_offset_page <- (current_page_number - 1)
				496	display_page_number <- current_page_number
				497	}
				498
				499	# Calculate the actual offset in tokens
				500	currentOffset <- current_offset_page * maxResultsPerPage
				501
Marc Kupietz	ef0e939	2025-06-18 12:21:49 +0200	[diff] [blame]	502	# Build the query with the appropriate count and offset using httr2
				503	count_param <- min(if (!is.na(maxFetch)) maxFetch - results else maxResultsPerPage, maxResultsPerPage)
Marc Kupietz	ecc8670	2025-06-24 12:12:51 +0200	[diff] [blame]	504
Marc Kupietz	ef0e939	2025-06-18 12:21:49 +0200	[diff] [blame]	505	# Parse existing URL to preserve all query parameters
				506	parsed_url <- httr2::url_parse(kqo@requestUrl)
				507	existing_query <- parsed_url$query
Marc Kupietz	ecc8670	2025-06-24 12:12:51 +0200	[diff] [blame]	508
Marc Kupietz	ef0e939	2025-06-18 12:21:49 +0200	[diff] [blame]	509	# Add/update count and offset parameters
				510	existing_query$count <- count_param
				511	existing_query$offset <- currentOffset
Marc Kupietz	ecc8670	2025-06-24 12:12:51 +0200	[diff] [blame]	512
Marc Kupietz	ef0e939	2025-06-18 12:21:49 +0200	[diff] [blame]	513	# Rebuild the URL with all parameters
				514	query <- httr2::url_modify(kqo@requestUrl, query = existing_query)
Marc Kupietz	6817095	2021-06-30 09:37:21 +0200	[diff] [blame]	515	res <- apiCall(kqo@korapConnection, query)
				516	if (length(res$matches) == 0) {
				517	break
				518	}
				519
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	520	if ("fields" %in% colnames(res$matches) && (is.na(use_korap_api) \|\| as.numeric(use_korap_api) >= 1.0)) {
Marc Kupietz	16ccf11	2025-01-26 13:25:27 +0100	[diff] [blame]	521	log_info(verbose, "Using fields API: ")
Marc Kupietz	05a6079	2024-12-07 16:23:31 +0100	[diff] [blame]	522	currentMatches <- res$matches$fields %>%
				523	purrr::map(~ mutate(.x, value = repair_data_strcuture(value))) %>%
				524	tibble::enframe() %>%
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	525	tidyr::unnest(cols = value) %>%
				526	tidyr::pivot_wider(names_from = key, id_cols = name, names_repair = "unique") %>%
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	527	dplyr::select(-name)
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	528	if ("snippet" %in% colnames(res$matches)) {
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	529	currentMatches$snippet <- res$matches$snippet
				530	}
Marc Kupietz	3cd2c6c	2025-01-08 20:35:39 +0100	[diff] [blame]	531	if ("tokens" %in% colnames(res$matches)) {
				532	currentMatches$tokens <- res$matches$tokens
				533	}
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	534	} else {
				535	currentMatches <- res$matches
				536	}
				537
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	538	for (field in kqo@fields) {
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	539	if (!field %in% colnames(currentMatches)) {
				540	currentMatches[, field] <- NA
Marc Kupietz	5bbc9db	2019-08-30 16:30:45 +0200	[diff] [blame]	541	}
				542	}
Marc Kupietz	f488112	2024-12-17 14:55:39 +0100	[diff] [blame]	543	currentMatches <- currentMatches %>%
				544	select(kqo@fields) %>%
				545	mutate(
Marc Kupietz	ff712a9	2025-07-18 09:07:23 +0200	[diff] [blame]	546	matchID = res$matches$matchID,
Marc Kupietz	0447da0	2025-01-08 20:51:09 +0100	[diff] [blame]	547	tmp_positions = gsub(".-p(\\d+)-(\\d+).", "\\1 \\2", res$matches$matchID),
Marc Kupietz	f488112	2024-12-17 14:55:39 +0100	[diff] [blame]	548	matchStart = as.integer(stringr::word(tmp_positions, 1)),
				549	matchEnd = as.integer(stringr::word(tmp_positions, 2)) - 1
				550	) %>%
				551	select(-tmp_positions)
				552
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	553	if (!is.list(collectedMatches)) {
				554	collectedMatches <- currentMatches
Marc Kupietz	5bbc9db	2019-08-30 16:30:45 +0200	[diff] [blame]	555	} else {
Marc Kupietz	2078bde	2023-08-27 16:46:15 +0200	[diff] [blame]	556	collectedMatches <- bind_rows(collectedMatches, currentMatches)
Marc Kupietz	5bbc9db	2019-08-30 16:30:45 +0200	[diff] [blame]	557	}
Marc Kupietz	ae9b617	2025-05-02 15:50:01 +0200	[diff] [blame]	558
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	559	# Get the actual items per page from the API response
				560	# We now consistently use maxResultsPerPage instead
Marc Kupietz	acbaab0	2025-05-01 10:56:35 +0200	[diff] [blame]	561
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	562	# Calculate total pages consistently using fixed maxResultsPerPage
				563	# This ensures consistent page counting across the function
				564	total_pages <- ceiling(kqo@totalResults / maxResultsPerPage)
				565
Marc Kupietz	24799fd	2025-06-25 14:15:36 +0200	[diff] [blame]	566	# Calculate ETA using the centralized function from logging.R
				567	current_page <- if (randomizePageOrder) page_index else display_page_number
				568	total_pages_to_fetch <- if (!is.na(maxFetch)) {
				569	# Account for offset - we can only fetch from the remaining results after offset
				570	remaining_results_after_offset <- max(0, kqo@totalResults - offset)
				571	min(ceiling(maxFetch / maxResultsPerPage), ceiling(remaining_results_after_offset / maxResultsPerPage))
				572	} else {
				573	total_pages
				574	}
Marc Kupietz	365660e	2025-06-25 15:09:55 +0200	[diff] [blame]	575
Marc Kupietz	24799fd	2025-06-25 14:15:36 +0200	[diff] [blame]	576	eta_info <- calculate_eta(current_page, total_pages_to_fetch, start_time)
Marc Kupietz	365660e	2025-06-25 15:09:55 +0200	[diff] [blame]	577
Marc Kupietz	24799fd	2025-06-25 14:15:36 +0200	[diff] [blame]	578	# Extract timing information for display
Marc Kupietz	ae9b617	2025-05-02 15:50:01 +0200	[diff] [blame]	579	time_per_page <- NA
Marc Kupietz	ae9b617	2025-05-02 15:50:01 +0200	[diff] [blame]	580	if (!is.null(res$meta$benchmark) && is.character(res$meta$benchmark)) {
Marc Kupietz	ae9b617	2025-05-02 15:50:01 +0200	[diff] [blame]	581	time_per_page <- suppressWarnings(as.numeric(sub("s", "", res$meta$benchmark)))
Marc Kupietz	acbaab0	2025-05-01 10:56:35 +0200	[diff] [blame]	582	}
				583
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	584	# Create the page display string with proper formatting
Marc Kupietz	acbaab0	2025-05-01 10:56:35 +0200	[diff] [blame]	585
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	586	# For global page tracking, calculate the absolute page number
				587	actual_display_number <- if (randomizePageOrder) {
				588	current_offset_page + 1 # In randomized mode, this is the actual page (0-based + 1)
				589	} else {
				590	# In sequential mode, the absolute page number is the actual offset page + 1 (to make it 1-based)
				591	current_offset_page + 1
				592	}
				593
				594	# For subsequent calls to fetchNext, we need to calculate the correct page numbers
				595	# based on the current batch being fetched
				596
				597	# For each call to fetchNext, we want to show 1/2, 2/2 (not 3/4, 4/4)
				598	# Simply count from 1 within the current batch
				599
				600	# The relative page number is simply the current position in this batch
				601	if (randomizePageOrder) {
				602	relative_page_number <- page_index # In randomized mode, we start from 1 in each batch
				603	} else {
				604	relative_page_number <- display_page_number - (page_count_start - 1)
				605	}
				606
				607	# How many pages will we fetch in this batch?
Marc Kupietz	021663d	2025-06-18 17:49:22 +0200	[diff] [blame]	608	# If maxFetch is specified, calculate the total pages for this fetch operation
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	609	pages_in_this_batch <- if (!is.na(maxFetch)) {
Marc Kupietz	021663d	2025-06-18 17:49:22 +0200	[diff] [blame]	610	# Account for offset - we can only fetch from the remaining results after offset
				611	remaining_results_after_offset <- max(0, kqo@totalResults - offset)
				612	min(ceiling(maxFetch / maxResultsPerPage), ceiling(remaining_results_after_offset / maxResultsPerPage))
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	613	} else {
				614	# Otherwise fetch all remaining pages
				615	total_pages - page_count_start + 1
				616	}
				617
				618	# The total pages to be shown in this batch
				619	batch_total_pages <- pages_in_this_batch
				620
				621	page_display <- paste0(
				622	"Retrieved page ",
				623	sprintf(paste0("%", nchar(batch_total_pages), "d"), relative_page_number),
				624	"/",
				625	sprintf("%d", batch_total_pages)
				626	)
				627
				628	# If randomized, also show which actual page we fetched
				629	if (randomizePageOrder) {
				630	# Determine the maximum width needed for page numbers (based on total pages)
				631	# This ensures consistent alignment
				632	max_page_width <- nchar(as.character(total_pages))
				633	# Add the actual page number that was fetched (0-based + 1 for display) with proper padding
Marc Kupietz	7638ca4	2025-05-25 13:18:16 +0200	[diff] [blame]	634	page_display <- paste0(
				635	page_display,
				636	sprintf(" (actual page %*d)", max_page_width, current_offset_page + 1)
				637	)
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	638	}
				639	# Always show the absolute page number and total pages (for clarity)
				640	else {
				641	# Show the absolute page number (out of total possible pages)
				642	page_display <- paste0(page_display, sprintf(
				643	" (page %d of %d total)",
				644	actual_display_number, total_pages
				645	))
				646	}
				647
				648	# Add caching or timing information
				649	if (!is.null(res$meta$cached)) {
				650	page_display <- paste0(page_display, " [cached]")
				651	} else {
				652	page_display <- paste0(
				653	page_display,
				654	" in ",
				655	if (!is.na(time_per_page)) sprintf("%4.1f", time_per_page) else "?",
Marc Kupietz	24799fd	2025-06-25 14:15:36 +0200	[diff] [blame]	656	"s",
				657	eta_info
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	658	)
				659	}
				660
				661	log_info(verbose, paste0(page_display, "\n"))
				662
				663	# Increment the appropriate counter based on mode
				664	if (randomizePageOrder) {
				665	page_index <- page_index + 1
				666	} else {
				667	current_page_number <- current_page_number + 1
				668	}
Marc Kupietz	5bbc9db	2019-08-30 16:30:45 +0200	[diff] [blame]	669	results <- results + res$meta$itemsPerPage
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	670	if (nrow(collectedMatches) >= kqo@totalResults \|\| (!is.na(maxFetch) && results >= maxFetch)) {
Marc Kupietz	5bbc9db	2019-08-30 16:30:45 +0200	[diff] [blame]	671	break
				672	}
				673	}
Marc Kupietz	6817095	2021-06-30 09:37:21 +0200	[diff] [blame]	674	nextStartIndex <- min(res$meta$startIndex + res$meta$itemsPerPage, kqo@totalResults)
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	675	KorAPQuery(
				676	nextStartIndex = nextStartIndex,
Marc Kupietz	d0d3e9b	2019-09-24 17:36:03 +0200	[diff] [blame]	677	korapConnection = kqo@korapConnection,
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	678	fields = kqo@fields,
				679	requestUrl = kqo@requestUrl,
				680	request = kqo@request,
Marc Kupietz	6817095	2021-06-30 09:37:21 +0200	[diff] [blame]	681	totalResults = kqo@totalResults,
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	682	vc = kqo@vc,
				683	webUIRequestUrl = kqo@webUIRequestUrl,
Marc Kupietz	6817095	2021-06-30 09:37:21 +0200	[diff] [blame]	684	hasMoreMatches = (kqo@totalResults > nextStartIndex),
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	685	apiResponse = res,
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	686	collectedMatches = collectedMatches
				687	)
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	688	})
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	689
				690	#' Fetch all results of a KorAP query.
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	691	#'
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	692	#' `fetchAll` fetches all results of a KorAP query.
Marc Kupietz	a6e4ee6	2021-03-05 09:00:15 +0100	[diff] [blame]	693	#'
Marc Kupietz	a8c40f4	2025-06-24 15:49:52 +0200	[diff] [blame]	694	#' @family corpus search functions
Marc Kupietz	dc880ac	2025-06-24 20:34:43 +0200	[diff] [blame]	695	#' @param kqo object obtained from [corpusQuery()]
				696	#' @param verbose print progress information if true
				697	#' @param ... further arguments passed to [fetchNext()]
				698	#' @return The updated `kqo` object with all results in `@collectedMatches`
Marc Kupietz	a8c40f4	2025-06-24 15:49:52 +0200	[diff] [blame]	699	#'
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	700	#' @examples
Marc Kupietz	6ae7605	2021-09-21 10:34:00 +0200	[diff] [blame]	701	#' \dontrun{
Marc Kupietz	ecc8670	2025-06-24 12:12:51 +0200	[diff] [blame]	702	#' # Fetch all metadata of every query hit for "Ameisenplage" and show a summary
				703	#' q <- KorAPConnection() \|>
				704	#' corpusQuery("Ameisenplage") \|>
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	705	#' fetchAll()
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	706	#' q@collectedMatches
Marc Kupietz	ecc8670	2025-06-24 12:12:51 +0200	[diff] [blame]	707	#'
				708	#' # Fetch also all KWICs
				709	#' q <- KorAPConnection() \|> auth() \|>
				710	#' corpusQuery("Ameisenplage", metadataOnly = FALSE) \|>
				711	#' fetchAll()
				712	#' q@collectedMatches
				713	#'
				714	#' # Retrieve title and text sigle metadata of all texts published on 1958-03-12
				715	#' q <- KorAPConnection() \|>
				716	#' corpusQuery("<base/s=t>", # this matches each text once
				717	#' vc = "pubDate in 1958-03-12",
				718	#' fields = c("textSigle", "title"),
				719	#' ) \|>
				720	#' fetchAll()
				721	#' q@collectedMatches
Marc Kupietz	05b2277	2020-02-18 21:58:42 +0100	[diff] [blame]	722	#' }
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	723	#'
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	724	#' @aliases fetchAll
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	725	#' @export
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	726	setMethod("fetchAll", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) {
				727	return(fetchNext(kqo, offset = 0, maxFetch = NA, verbose = verbose, ...))
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	728	})
				729
				730	#' Fetches the remaining results of a KorAP query.
				731	#'
Marc Kupietz	dc880ac	2025-06-24 20:34:43 +0200	[diff] [blame]	732	#' @param kqo object obtained from [corpusQuery()]
				733	#' @param verbose print progress information if true
				734	#' @param ... further arguments passed to [fetchNext()]
				735	#' @return The updated `kqo` object with remaining results in `@collectedMatches`
				736	#'
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	737	#' @examples
Marc Kupietz	6ae7605	2021-09-21 10:34:00 +0200	[diff] [blame]	738	#' \dontrun{
				739	#'
Marc Kupietz	d352642	2025-06-25 09:16:15 +0200	[diff] [blame]	740	#' q <- KorAPConnection() \|>
				741	#' corpusQuery("Ameisenplage") \|>
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	742	#' fetchRest()
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	743	#' q@collectedMatches
Marc Kupietz	05b2277	2020-02-18 21:58:42 +0100	[diff] [blame]	744	#' }
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	745	#'
				746	#' @aliases fetchRest
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	747	#' @export
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	748	setMethod("fetchRest", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) {
				749	return(fetchNext(kqo, maxFetch = NA, verbose = verbose, ...))
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	750	})
				751
Marc Kupietz	bdedd02	2025-10-09 14:14:15 +0200	[diff] [blame]	752	# Helper to collapse multiple annotation values while preserving order
				753	collapse_features <- function(values) {
				754	if (length(values) == 0) {
				755	return(NA_character_)
				756	}
				757	unique_values <- values[!duplicated(values)]
				758	paste(unique_values, collapse = "\|")
				759	}
				760
				761	# Extract token-level annotations from a DOM node
				762	collect_token_annotations <- function(parent_node) {
				763	if (inherits(parent_node, "xml_missing")) {
				764	return(list(
				765	node = list(),
				766	token = character(0),
				767	lemma = character(0),
				768	pos = character(0),
				769	morph = character(0)
				770	))
				771	}
				772
				773	leaf_nodes <- xml2::xml_find_all(parent_node, ".//span[not(.//span)]")
				774
				775	if (length(leaf_nodes) == 0) {
				776	return(list(
				777	node = list(),
				778	token = character(0),
				779	lemma = character(0),
				780	pos = character(0),
				781	morph = character(0)
				782	))
				783	}
				784
				785	tokens <- character(0)
				786	lemmas <- character(0)
				787	pos_tags <- character(0)
				788	morph_tags <- character(0)
				789	kept_nodes <- list()
				790
				791	for (idx in seq_along(leaf_nodes)) {
				792	leaf <- leaf_nodes[[idx]]
				793	token_text <- trimws(xml2::xml_text(leaf))
				794	if (identical(token_text, "")) {
				795	next
				796	}
				797
				798	kept_nodes[[length(kept_nodes) + 1]] <- leaf
				799	tokens <- c(tokens, token_text)
				800
				801	ancestors <- xml2::xml_find_all(leaf, "ancestor-or-self::span")
				802	titles <- xml2::xml_attr(ancestors, "title")
				803	titles <- titles[!is.na(titles)]
				804
				805	feature_pieces <- if (length(titles) > 0) unlist(strsplit(titles, "[[:space:]]+")) else character(0)
				806
				807	lemma_values <- sub('.?/l:(.)$', '\\1', feature_pieces[grepl('/l:', feature_pieces)], perl = TRUE)
				808	pos_values <- sub('.?/p:(.)$', '\\1', feature_pieces[grepl('/p:', feature_pieces)], perl = TRUE)
				809	morph_values <- sub('.?/m:(.)$', '\\1', feature_pieces[grepl('/m:', feature_pieces)], perl = TRUE)
				810
				811	lemmas <- c(lemmas, collapse_features(lemma_values))
				812	pos_tags <- c(pos_tags, collapse_features(pos_values))
				813	morph_tags <- c(morph_tags, collapse_features(morph_values))
				814	}
				815
				816	list(
				817	node = kept_nodes,
				818	token = tokens,
				819	lemma = lemmas,
				820	pos = pos_tags,
				821	morph = morph_tags
				822	)
				823	}
				824
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	825	#'
				826	#' Parse XML annotations into linguistic layers
				827	#'
				828	#' Internal helper function to extract linguistic annotations (lemma, POS, morphology)
				829	#' from XML annotation snippets returned by the KorAP API.
				830	#'
				831	#' @param xml_snippet XML string containing annotation data
				832	#' @return Named list with vectors for 'token', 'lemma', 'pos', and 'morph'
				833	#' @keywords internal
				834	parse_xml_annotations <- function(xml_snippet) {
				835	if (is.null(xml_snippet) \|\| is.na(xml_snippet) \|\| xml_snippet == "") {
				836	return(list(token = character(0), lemma = character(0), pos = character(0), morph = character(0)))
				837	}
				838
Marc Kupietz	bdedd02	2025-10-09 14:14:15 +0200	[diff] [blame]	839	doc <- tryCatch(xml2::read_html(paste0("<root>", xml_snippet, "</root>")), error = function(e) NULL)
				840	if (is.null(doc)) {
				841	return(list(token = character(0), lemma = character(0), pos = character(0), morph = character(0)))
Marc Kupietz	cd45218	2025-10-09 13:28:41 +0200	[diff] [blame]	842	}
				843
Marc Kupietz	bdedd02	2025-10-09 14:14:15 +0200	[diff] [blame]	844	match_node <- xml2::xml_find_first(doc, ".//span[contains(@class, 'match')]")
				845	if (inherits(match_node, "xml_missing")) {
				846	match_node <- xml2::xml_find_first(doc, ".//span")
				847	if (inherits(match_node, "xml_missing")) {
				848	return(list(token = character(0), lemma = character(0), pos = character(0), morph = character(0)))
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	849	}
				850	}
				851
Marc Kupietz	bdedd02	2025-10-09 14:14:15 +0200	[diff] [blame]	852	token_info <- collect_token_annotations(match_node)
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	853
Marc Kupietz	bdedd02	2025-10-09 14:14:15 +0200	[diff] [blame]	854	list(
				855	token = token_info$token,
				856	lemma = token_info$lemma,
				857	pos = token_info$pos,
				858	morph = token_info$morph
				859	)
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	860	}
				861
				862	#'
				863	#' Parse XML annotations into linguistic layers with left/match/right structure
				864	#'
				865	#' Internal helper function to extract linguistic annotations (lemma, POS, morphology)
				866	#' from XML annotation snippets returned by the KorAP API, split into left context,
				867	#' match, and right context sections like the tokens field.
				868	#'
				869	#' @param xml_snippet XML string containing annotation data
				870	#' @return Named list with nested structure containing left/match/right for 'atokens', 'lemma', 'pos', and 'morph'
				871	#' @keywords internal
				872	parse_xml_annotations_structured <- function(xml_snippet) {
				873	if (is.null(xml_snippet) \|\| is.na(xml_snippet) \|\| xml_snippet == "") {
				874	empty_result <- list(left = character(0), match = character(0), right = character(0))
				875	return(list(
				876	atokens = empty_result,
				877	lemma = empty_result,
				878	pos = empty_result,
				879	morph = empty_result
				880	))
				881	}
				882
Marc Kupietz	bdedd02	2025-10-09 14:14:15 +0200	[diff] [blame]	883	doc <- tryCatch(xml2::read_html(paste0("<root>", xml_snippet, "</root>")), error = function(e) NULL)
				884	if (is.null(doc)) {
				885	empty_result <- list(left = character(0), match = character(0), right = character(0))
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	886	return(list(
Marc Kupietz	bdedd02	2025-10-09 14:14:15 +0200	[diff] [blame]	887	atokens = empty_result,
				888	lemma = empty_result,
				889	pos = empty_result,
				890	morph = empty_result
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	891	))
				892	}
				893
Marc Kupietz	bdedd02	2025-10-09 14:14:15 +0200	[diff] [blame]	894	match_node <- xml2::xml_find_first(doc, ".//span[contains(@class, 'match')]")
				895	if (inherits(match_node, "xml_missing")) {
				896	empty_result <- list(left = character(0), match = character(0), right = character(0))
				897	return(list(
				898	atokens = empty_result,
				899	lemma = empty_result,
				900	pos = empty_result,
				901	morph = empty_result
				902	))
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	903	}
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	904
Marc Kupietz	bdedd02	2025-10-09 14:14:15 +0200	[diff] [blame]	905	token_info <- collect_token_annotations(match_node)
				906	tokens <- token_info$token
				907	lemmas <- token_info$lemma
				908	pos_tags <- token_info$pos
				909	morph_tags <- token_info$morph
				910	nodes <- token_info$node
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	911
Marc Kupietz	bdedd02	2025-10-09 14:14:15 +0200	[diff] [blame]	912	if (length(tokens) == 0) {
				913	empty_result <- list(left = character(0), match = character(0), right = character(0))
				914	return(list(
				915	atokens = empty_result,
				916	lemma = empty_result,
				917	pos = empty_result,
				918	morph = empty_result
				919	))
				920	}
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	921
Marc Kupietz	bdedd02	2025-10-09 14:14:15 +0200	[diff] [blame]	922	mark_flags <- vapply(nodes, function(n) {
				923	!inherits(xml2::xml_find_first(n, "ancestor::mark"), "xml_missing")
				924	}, logical(1))
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	925
Marc Kupietz	bdedd02	2025-10-09 14:14:15 +0200	[diff] [blame]	926	if (any(mark_flags)) {
				927	first_idx <- which(mark_flags)[1]
				928	last_idx <- tail(which(mark_flags), 1)
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	929	} else {
Marc Kupietz	bdedd02	2025-10-09 14:14:15 +0200	[diff] [blame]	930	first_idx <- 1
				931	last_idx <- length(tokens)
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	932	}
				933
Marc Kupietz	bdedd02	2025-10-09 14:14:15 +0200	[diff] [blame]	934	sections <- rep("match", length(tokens))
				935	if (first_idx > 1) {
				936	sections[seq_len(first_idx - 1)] <- "left"
				937	}
				938	if (last_idx < length(tokens)) {
				939	sections[seq(from = last_idx + 1, to = length(tokens))] <- "right"
				940	}
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	941
Marc Kupietz	bdedd02	2025-10-09 14:14:15 +0200	[diff] [blame]	942	subset_by_section <- function(values, section) {
				943	idx <- sections == section
				944	if (!any(idx)) {
				945	return(character(0))
				946	}
				947	values[idx]
				948	}
				949
				950	atokens <- list(
				951	left = subset_by_section(tokens, "left"),
				952	match = subset_by_section(tokens, "match"),
				953	right = subset_by_section(tokens, "right")
				954	)
				955
				956	lemma <- list(
				957	left = subset_by_section(lemmas, "left"),
				958	match = subset_by_section(lemmas, "match"),
				959	right = subset_by_section(lemmas, "right")
				960	)
				961
				962	pos <- list(
				963	left = subset_by_section(pos_tags, "left"),
				964	match = subset_by_section(pos_tags, "match"),
				965	right = subset_by_section(pos_tags, "right")
				966	)
				967
				968	morph <- list(
				969	left = subset_by_section(morph_tags, "left"),
				970	match = subset_by_section(morph_tags, "match"),
				971	right = subset_by_section(morph_tags, "right")
				972	)
				973
				974	list(
				975	atokens = atokens,
				976	lemma = lemma,
				977	pos = pos,
				978	morph = morph
				979	)
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	980	}
				981
Marc Kupietz	e52b295	2025-07-17 16:53:02 +0200	[diff] [blame]	982	#' Fetch annotations for all collected matches
				983	#'
Marc Kupietz	89f796e	2025-07-19 09:05:06 +0200	[diff] [blame]	984	#' `r lifecycle::badge("experimental")`
				985	#'
				986	#' `fetchAnnotations` fetches annotations (only token annotations, for now)
				987	#' for all matches in the `@collectedMatches` slot
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	988	#' of a KorAPQuery object and adds annotation columns directly to the `@collectedMatches`
Marc Kupietz	89f796e	2025-07-19 09:05:06 +0200	[diff] [blame]	989	#' data frame. The method uses the `matchID` from collected matches.
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	990	#'
				991	#' Important: For copyright-restricted corpora, users must be authorized via [auth()]
				992	#' and the initial corpus query must have `metadataOnly = FALSE` to ensure snippets are
				993	#' available for annotation parsing.
				994	#'
				995	#' The method parses XML snippet annotations and adds linguistic columns to the data frame:
				996	#' - `pos`: data frame with `left`, `match`, `right` columns, each containing list vectors of part-of-speech tags
				997	#' - `lemma`: data frame with `left`, `match`, `right` columns, each containing list vectors of lemmas
				998	#' - `morph`: data frame with `left`, `match`, `right` columns, each containing list vectors of morphological tags
				999	#' - `atokens`: data frame with `left`, `match`, `right` columns, each containing list vectors of token text (from annotations)
				1000	#' - `annotation_snippet`: original XML snippet from the annotation API
Marc Kupietz	e52b295	2025-07-17 16:53:02 +0200	[diff] [blame]	1001	#'
				1002	#' @family corpus search functions
Marc Kupietz	89f796e	2025-07-19 09:05:06 +0200	[diff] [blame]	1003	#' @concept Annotations
Marc Kupietz	e52b295	2025-07-17 16:53:02 +0200	[diff] [blame]	1004	#'
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1005	#' @param kqo object obtained from [corpusQuery()] with collected matches. Note: the original corpus query should have `metadataOnly = FALSE` for annotation parsing to work.
Marc Kupietz	e52b295	2025-07-17 16:53:02 +0200	[diff] [blame]	1006	#' @param foundry string specifying the foundry to use for annotations (default: "tt" for Tree-Tagger)
Marc Kupietz	93787d5	2025-09-03 13:33:25 +0200	[diff] [blame]	1007	#' @param overwrite logical; if TRUE, re-fetch and replace any existing
				1008	#' annotation columns. If FALSE (default), only add missing annotation layers
				1009	#' and preserve already fetched ones (e.g., keep POS/lemma from a previous
				1010	#' foundry while adding morph from another).
Marc Kupietz	e52b295	2025-07-17 16:53:02 +0200	[diff] [blame]	1011	#' @param verbose print progress information if true
Marc Kupietz	0af7593	2025-09-09 18:14:16 +0200	[diff] [blame]	1012	#' @return The updated `kqo` object with annotation columns
Marc Kupietz	89f796e	2025-07-19 09:05:06 +0200	[diff] [blame]	1013	#' like `pos`, `lemma`, `morph` (and `atokens` and `annotation_snippet`)
				1014	#' in the `@collectedMatches` slot. Each column is a data frame
				1015	#' with `left`, `match`, and `right` columns containing list vectors of annotations
				1016	#' for the left context, matched tokens, and right context, respectively.
				1017	#' The original XML snippet for each match is also stored in `annotation_snippet`.
Marc Kupietz	e52b295	2025-07-17 16:53:02 +0200	[diff] [blame]	1018	#'
				1019	#' @examples
				1020	#' \dontrun{
				1021	#'
				1022	#' # Fetch annotations for matches using Tree-Tagger foundry
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1023	#' # Note: Authorization required for copyright-restricted corpora
Marc Kupietz	e52b295	2025-07-17 16:53:02 +0200	[diff] [blame]	1024	#' q <- KorAPConnection() \|>
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1025	#' auth() \|>
				1026	#' corpusQuery("Ameisenplage", metadataOnly = FALSE) \|>
Marc Kupietz	e52b295	2025-07-17 16:53:02 +0200	[diff] [blame]	1027	#' fetchNext(maxFetch = 10) \|>
				1028	#' fetchAnnotations()
Marc Kupietz	e52b295	2025-07-17 16:53:02 +0200	[diff] [blame]	1029	#'
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1030	#' # Access linguistic annotations for match i:
Marc Kupietz	6aa5a0d	2025-09-08 17:51:47 +0200	[diff] [blame]	1031	#' pos_tags <- q@collectedMatches$pos
				1032	#' # Data frame with left/match/right columns for POS tags
				1033	#' lemmas <- q@collectedMatches$lemma
				1034	#' # Data frame with left/match/right columns for lemmas
				1035	#' morphology <- q@collectedMatches$morph
				1036	#' # Data frame with left/match/right columns for morphological tags
				1037	#' atokens <- q@collectedMatches$atokens
				1038	#' # Data frame with left/match/right columns for annotation token text
Marc Kupietz	0af7593	2025-09-09 18:14:16 +0200	[diff] [blame]	1039	#' # Original XML snippet for match i
				1040	#' raw_snippet <- q@collectedMatches$annotation_snippet[[i]]
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	1041	#'
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1042	#' # Access specific components:
Marc Kupietz	0af7593	2025-09-09 18:14:16 +0200	[diff] [blame]	1043	#' # POS tags for the matched tokens in match i
				1044	#' match_pos <- q@collectedMatches$pos$match[[i]]
				1045	#' # Lemmas for the left context in match i
				1046	#' left_lemmas <- q@collectedMatches$lemma$left[[i]]
				1047	#' # Token text for the right context in match i
				1048	#' right_tokens <- q@collectedMatches$atokens$right[[i]]
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1049	#'
Marc Kupietz	89f796e	2025-07-19 09:05:06 +0200	[diff] [blame]	1050	#' # Use a different foundry (e.g., MarMoT)
Marc Kupietz	e52b295	2025-07-17 16:53:02 +0200	[diff] [blame]	1051	#' q <- KorAPConnection() \|>
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1052	#' auth() \|>
				1053	#' corpusQuery("Ameisenplage", metadataOnly = FALSE) \|>
Marc Kupietz	e52b295	2025-07-17 16:53:02 +0200	[diff] [blame]	1054	#' fetchNext(maxFetch = 10) \|>
Marc Kupietz	89f796e	2025-07-19 09:05:06 +0200	[diff] [blame]	1055	#' fetchAnnotations(foundry = "marmot")
				1056	#' q@collectedMatches$pos$left[1] # POS tags for the left context of the first match
Marc Kupietz	e52b295	2025-07-17 16:53:02 +0200	[diff] [blame]	1057	#' }
Marc Kupietz	e52b295	2025-07-17 16:53:02 +0200	[diff] [blame]	1058	#' @export
Marc Kupietz	0af7593	2025-09-09 18:14:16 +0200	[diff] [blame]	1059	setMethod("fetchAnnotations", "KorAPQuery", function(kqo,
				1060	foundry = "tt",
				1061	overwrite = FALSE,
				1062	verbose = kqo@korapConnection@verbose) {
				1063	if (is.null(kqo@collectedMatches) \|\|
				1064	nrow(kqo@collectedMatches) == 0) {
				1065	warning("No collected matches found. Please run fetchNext() or fetchAll() first.")
				1066	return(kqo)
				1067	}
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1068
Marc Kupietz	e52b295	2025-07-17 16:53:02 +0200	[diff] [blame]	1069	df <- kqo@collectedMatches
				1070	kco <- kqo@korapConnection
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1071
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1072	# Initialize annotation columns as data frames (like tokens field)
				1073	# Create the structure more explicitly to avoid assignment issues
				1074	nrows <- nrow(df)
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	1075
Marc Kupietz	03d2b1a	2025-07-19 09:14:45 +0200	[diff] [blame]	1076	# Pre-compute the empty character vector list to avoid repeated computation
				1077	empty_char_list <- I(replicate(nrows, character(0), simplify = FALSE))
Marc Kupietz	0af7593	2025-09-09 18:14:16 +0200	[diff] [blame]	1078
Marc Kupietz	03d2b1a	2025-07-19 09:14:45 +0200	[diff] [blame]	1079	# Helper function to create annotation data frame structure
				1080	create_annotation_df <- function(empty_list) {
				1081	data.frame(
				1082	left = empty_list,
				1083	match = empty_list,
				1084	right = empty_list,
				1085	stringsAsFactors = FALSE
				1086	)
				1087	}
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	1088
Marc Kupietz	93787d5	2025-09-03 13:33:25 +0200	[diff] [blame]	1089	# Track which annotation columns already existed to decide overwrite behavior
				1090	existing_types <- list(
				1091	pos = "pos" %in% colnames(df),
				1092	lemma = "lemma" %in% colnames(df),
				1093	morph = "morph" %in% colnames(df),
				1094	atokens = "atokens" %in% colnames(df),
				1095	annotation_snippet = "annotation_snippet" %in% colnames(df)
				1096	)
				1097
				1098	# Initialize annotation columns using the helper function
Marc Kupietz	03d2b1a	2025-07-19 09:14:45 +0200	[diff] [blame]	1099	annotation_types <- c("pos", "lemma", "morph", "atokens")
				1100	for (type in annotation_types) {
Marc Kupietz	93787d5	2025-09-03 13:33:25 +0200	[diff] [blame]	1101	if (overwrite \|\| !existing_types[[type]]) {
				1102	df[[type]] <- create_annotation_df(empty_char_list)
				1103	}
Marc Kupietz	03d2b1a	2025-07-19 09:14:45 +0200	[diff] [blame]	1104	}
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	1105
Marc Kupietz	93787d5	2025-09-03 13:33:25 +0200	[diff] [blame]	1106	if (overwrite \|\| !existing_types$annotation_snippet) {
feldmueller	a02f193	2025-09-15 16:38:06 +0200	[diff] [blame]	1107	df$annotation_snippet <- rep(NA_character_, nrows) # Fixed line
Marc Kupietz	93787d5	2025-09-03 13:33:25 +0200	[diff] [blame]	1108	}
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1109
Marc Kupietz	e8c0fef	2025-07-18 19:59:04 +0200	[diff] [blame]	1110	# Initialize timing for ETA calculation
				1111	start_time <- Sys.time()
				1112	if (verbose) {
				1113	log_info(verbose, paste("Starting to fetch annotations for", nrows, "matches\n"))
				1114	}
				1115
Marc Kupietz	93787d5	2025-09-03 13:33:25 +0200	[diff] [blame]	1116	# Helper to decide if existing annotation row is effectively empty
				1117	is_empty_annotation_row <- function(ann_df, row_index) {
				1118	if (is.null(ann_df) \|\| nrow(ann_df) < row_index) return(TRUE)
				1119	left_val <- ann_df$left[[row_index]]
				1120	match_val <- ann_df$match[[row_index]]
				1121	right_val <- ann_df$right[[row_index]]
				1122	all(
				1123	(is.null(left_val) \|\| (length(left_val) == 0) \|\| all(is.na(left_val))),
				1124	(is.null(match_val) \|\| (length(match_val) == 0) \|\| all(is.na(match_val))),
				1125	(is.null(right_val) \|\| (length(right_val) == 0) \|\| all(is.na(right_val)))
				1126	)
				1127	}
				1128
Marc Kupietz	e52b295	2025-07-17 16:53:02 +0200	[diff] [blame]	1129	for (i in seq_len(nrow(df))) {
Marc Kupietz	e8c0fef	2025-07-18 19:59:04 +0200	[diff] [blame]	1130	# ETA logging
				1131	if (verbose && i > 1) {
				1132	eta_info <- calculate_eta(i, nrows, start_time)
				1133	log_info(verbose, paste("Fetching annotations for match", i, "of", nrows, eta_info, "\n"))
				1134	}
Marc Kupietz	ff712a9	2025-07-18 09:07:23 +0200	[diff] [blame]	1135	# Use matchID if available, otherwise fall back to constructing from matchStart/matchEnd
				1136	if ("matchID" %in% colnames(df) && !is.na(df$matchID[i])) {
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1137	# matchID format: "match-match-A00/JUN/39609-p202-203" or encrypted format like
				1138	# "match-DNB10/CSL/80400-p2343-2344x_MinDOhu_P6dd2MMZJyyus_7MairdKnr1LxY07Cya-Ow"
				1139	# Extract document path and position, handling both regular and encrypted formats
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	1140
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1141	# More flexible regex to extract the document path with position and encryption
				1142	# Look for pattern: match-(...)-p(\d+)-(\d+)(.) where (.) is the encrypted part
				1143	# We need to capture the entire path including the encrypted suffix
				1144	match_result <- regexpr("match-(.+?-p\\d+-\\d+.*)", df$matchID[i], perl = TRUE)
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	1145
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1146	if (match_result > 0) {
				1147	# Extract the complete path including encryption (everything after "match-")
				1148	doc_path_with_pos_and_encryption <- gsub("^match-(.+)$", "\\1", df$matchID[i], perl = TRUE)
				1149	# Convert the dash before position to slash, but keep everything after the position
				1150	match_path <- gsub("-p(\\d+-\\d+.*)", "/p\\1", doc_path_with_pos_and_encryption)
Marc Kupietz	2512130	2025-07-19 08:45:43 +0200	[diff] [blame]	1151	# Use httr2 to construct URL safely
				1152	base_url <- paste0(kco@apiUrl, "corpus/", match_path)
				1153	req <- httr2::url_modify(base_url, query = list(foundry = foundry))
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1154	} else {
Marc Kupietz	2512130	2025-07-19 08:45:43 +0200	[diff] [blame]	1155	# If regex fails, fall back to the old method with httr2
				1156	# Format numbers to avoid scientific notation
				1157	match_start <- format(df$matchStart[i], scientific = FALSE)
				1158	match_end <- format(df$matchEnd[i], scientific = FALSE)
				1159	base_url <- paste0(kco@apiUrl, "corpus/", df$textSigle[i], "/", "p", match_start, "-", match_end)
				1160	req <- httr2::url_modify(base_url, query = list(foundry = foundry))
Marc Kupietz	ff712a9	2025-07-18 09:07:23 +0200	[diff] [blame]	1161	}
				1162	} else {
Marc Kupietz	2512130	2025-07-19 08:45:43 +0200	[diff] [blame]	1163	# Fallback to the old method with httr2
				1164	# Format numbers to avoid scientific notation
				1165	match_start <- format(df$matchStart[i], scientific = FALSE)
				1166	match_end <- format(df$matchEnd[i], scientific = FALSE)
				1167	base_url <- paste0(kco@apiUrl, "corpus/", df$textSigle[i], "/", "p", match_start, "-", match_end)
				1168	req <- httr2::url_modify(base_url, query = list(foundry = foundry))
Marc Kupietz	ff712a9	2025-07-18 09:07:23 +0200	[diff] [blame]	1169	}
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1170
Marc Kupietz	e52b295	2025-07-17 16:53:02 +0200	[diff] [blame]	1171	tryCatch({
				1172	res <- apiCall(kco, req)
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	1173
Marc Kupietz	e52b295	2025-07-17 16:53:02 +0200	[diff] [blame]	1174	if (!is.null(res)) {
Marc Kupietz	93787d5	2025-09-03 13:33:25 +0200	[diff] [blame]	1175	# Store the raw annotation snippet (respect overwrite flag)
				1176	if (overwrite \|\| !existing_types$annotation_snippet \|\| is.null(df$annotation_snippet[[i]]) \|\| is.na(df$annotation_snippet[[i]])) {
				1177	df$annotation_snippet[[i]] <- if (is.list(res) && "snippet" %in% names(res)) res$snippet else NA
				1178	}
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1179
				1180	# Parse XML annotations if snippet is available
				1181	if (is.list(res) && "snippet" %in% names(res)) {
				1182	parsed_annotations <- parse_xml_annotations_structured(res$snippet)
				1183
				1184	# Store the parsed linguistic data in data frame format (like tokens)
				1185	# Use individual assignment to avoid data frame mismatch errors
				1186	tryCatch({
				1187	# Assign POS annotations
Marc Kupietz	93787d5	2025-09-03 13:33:25 +0200	[diff] [blame]	1188	if (overwrite \|\| !existing_types$pos \|\| is_empty_annotation_row(df$pos, i)) {
				1189	df$pos$left[i] <- list(parsed_annotations$pos$left)
				1190	df$pos$match[i] <- list(parsed_annotations$pos$match)
				1191	df$pos$right[i] <- list(parsed_annotations$pos$right)
				1192	}
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	1193
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1194	# Assign lemma annotations
Marc Kupietz	93787d5	2025-09-03 13:33:25 +0200	[diff] [blame]	1195	if (overwrite \|\| !existing_types$lemma \|\| is_empty_annotation_row(df$lemma, i)) {
				1196	df$lemma$left[i] <- list(parsed_annotations$lemma$left)
				1197	df$lemma$match[i] <- list(parsed_annotations$lemma$match)
				1198	df$lemma$right[i] <- list(parsed_annotations$lemma$right)
				1199	}
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	1200
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1201	# Assign morphology annotations
Marc Kupietz	93787d5	2025-09-03 13:33:25 +0200	[diff] [blame]	1202	if (overwrite \|\| !existing_types$morph \|\| is_empty_annotation_row(df$morph, i)) {
				1203	df$morph$left[i] <- list(parsed_annotations$morph$left)
				1204	df$morph$match[i] <- list(parsed_annotations$morph$match)
				1205	df$morph$right[i] <- list(parsed_annotations$morph$right)
				1206	}
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	1207
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1208	# Assign token annotations
Marc Kupietz	93787d5	2025-09-03 13:33:25 +0200	[diff] [blame]	1209	if (overwrite \|\| !existing_types$atokens \|\| is_empty_annotation_row(df$atokens, i)) {
				1210	df$atokens$left[i] <- list(parsed_annotations$atokens$left)
				1211	df$atokens$match[i] <- list(parsed_annotations$atokens$match)
				1212	df$atokens$right[i] <- list(parsed_annotations$atokens$right)
				1213	}
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1214	}, error = function(assign_error) {
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1215	# Set empty character vectors on assignment error using list assignment
Marc Kupietz	93787d5	2025-09-03 13:33:25 +0200	[diff] [blame]	1216	if (overwrite \|\| !existing_types$pos) {
				1217	df$pos$left[i] <<- list(character(0))
				1218	df$pos$match[i] <<- list(character(0))
				1219	df$pos$right[i] <<- list(character(0))
				1220	}
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	1221
Marc Kupietz	93787d5	2025-09-03 13:33:25 +0200	[diff] [blame]	1222	if (overwrite \|\| !existing_types$lemma) {
				1223	df$lemma$left[i] <<- list(character(0))
				1224	df$lemma$match[i] <<- list(character(0))
				1225	df$lemma$right[i] <<- list(character(0))
				1226	}
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	1227
Marc Kupietz	93787d5	2025-09-03 13:33:25 +0200	[diff] [blame]	1228	if (overwrite \|\| !existing_types$morph) {
				1229	df$morph$left[i] <<- list(character(0))
				1230	df$morph$match[i] <<- list(character(0))
				1231	df$morph$right[i] <<- list(character(0))
				1232	}
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	1233
Marc Kupietz	93787d5	2025-09-03 13:33:25 +0200	[diff] [blame]	1234	if (overwrite \|\| !existing_types$atokens) {
				1235	df$atokens$left[i] <<- list(character(0))
				1236	df$atokens$match[i] <<- list(character(0))
				1237	df$atokens$right[i] <<- list(character(0))
				1238	}
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1239	})
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1240	} else {
				1241	# No snippet available, store empty vectors
Marc Kupietz	93787d5	2025-09-03 13:33:25 +0200	[diff] [blame]	1242	if (overwrite \|\| !existing_types$pos) {
				1243	df$pos$left[i] <- list(character(0))
				1244	df$pos$match[i] <- list(character(0))
				1245	df$pos$right[i] <- list(character(0))
				1246	}
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	1247
Marc Kupietz	93787d5	2025-09-03 13:33:25 +0200	[diff] [blame]	1248	if (overwrite \|\| !existing_types$lemma) {
				1249	df$lemma$left[i] <- list(character(0))
				1250	df$lemma$match[i] <- list(character(0))
				1251	df$lemma$right[i] <- list(character(0))
				1252	}
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	1253
Marc Kupietz	93787d5	2025-09-03 13:33:25 +0200	[diff] [blame]	1254	if (overwrite \|\| !existing_types$morph) {
				1255	df$morph$left[i] <- list(character(0))
				1256	df$morph$match[i] <- list(character(0))
				1257	df$morph$right[i] <- list(character(0))
				1258	}
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	1259
Marc Kupietz	93787d5	2025-09-03 13:33:25 +0200	[diff] [blame]	1260	if (overwrite \|\| !existing_types$atokens) {
				1261	df$atokens$left[i] <- list(character(0))
				1262	df$atokens$match[i] <- list(character(0))
				1263	df$atokens$right[i] <- list(character(0))
				1264	}
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1265	}
Marc Kupietz	e52b295	2025-07-17 16:53:02 +0200	[diff] [blame]	1266	} else {
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1267	# Store NAs for failed requests
Marc Kupietz	93787d5	2025-09-03 13:33:25 +0200	[diff] [blame]	1268	if (overwrite \|\| !existing_types$pos) {
				1269	df$pos$left[i] <- list(NA)
				1270	df$pos$match[i] <- list(NA)
				1271	df$pos$right[i] <- list(NA)
				1272	}
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	1273
Marc Kupietz	93787d5	2025-09-03 13:33:25 +0200	[diff] [blame]	1274	if (overwrite \|\| !existing_types$lemma) {
				1275	df$lemma$left[i] <- list(NA)
				1276	df$lemma$match[i] <- list(NA)
				1277	df$lemma$right[i] <- list(NA)
				1278	}
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	1279
Marc Kupietz	93787d5	2025-09-03 13:33:25 +0200	[diff] [blame]	1280	if (overwrite \|\| !existing_types$morph) {
				1281	df$morph$left[i] <- list(NA)
				1282	df$morph$match[i] <- list(NA)
				1283	df$morph$right[i] <- list(NA)
				1284	}
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	1285
Marc Kupietz	93787d5	2025-09-03 13:33:25 +0200	[diff] [blame]	1286	if (overwrite \|\| !existing_types$atokens) {
				1287	df$atokens$left[i] <- list(NA)
				1288	df$atokens$match[i] <- list(NA)
				1289	df$atokens$right[i] <- list(NA)
				1290	}
				1291	if (overwrite \|\| !existing_types$annotation_snippet) {
				1292	df$annotation_snippet[[i]] <- NA
				1293	}
Marc Kupietz	e52b295	2025-07-17 16:53:02 +0200	[diff] [blame]	1294	}
				1295	}, error = function(e) {
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1296	# Store NAs for failed requests
Marc Kupietz	93787d5	2025-09-03 13:33:25 +0200	[diff] [blame]	1297	if (overwrite \|\| !existing_types$pos) {
				1298	df$pos$left[i] <- list(NA)
				1299	df$pos$match[i] <- list(NA)
				1300	df$pos$right[i] <- list(NA)
				1301	}
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	1302
Marc Kupietz	93787d5	2025-09-03 13:33:25 +0200	[diff] [blame]	1303	if (overwrite \|\| !existing_types$lemma) {
				1304	df$lemma$left[i] <- list(NA)
				1305	df$lemma$match[i] <- list(NA)
				1306	df$lemma$right[i] <- list(NA)
				1307	}
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	1308
Marc Kupietz	93787d5	2025-09-03 13:33:25 +0200	[diff] [blame]	1309	if (overwrite \|\| !existing_types$morph) {
				1310	df$morph$left[i] <- list(NA)
				1311	df$morph$match[i] <- list(NA)
				1312	df$morph$right[i] <- list(NA)
				1313	}
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	1314
Marc Kupietz	93787d5	2025-09-03 13:33:25 +0200	[diff] [blame]	1315	if (overwrite \|\| !existing_types$atokens) {
				1316	df$atokens$left[i] <- list(NA)
				1317	df$atokens$match[i] <- list(NA)
				1318	df$atokens$right[i] <- list(NA)
				1319	}
				1320	if (overwrite \|\| !existing_types$annotation_snippet) {
				1321	df$annotation_snippet[[i]] <- NA
				1322	}
Marc Kupietz	e52b295	2025-07-17 16:53:02 +0200	[diff] [blame]	1323	})
				1324	}
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1325
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1326	# Validate data frame structure before assignment
				1327	if (nrow(df) != nrow(kqo@collectedMatches)) {
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1328	}
				1329
				1330	# Update the collectedMatches with annotation data
				1331	tryCatch({
				1332	kqo@collectedMatches <- df
				1333	}, error = function(assign_error) {
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1334	# Try a safer approach: add columns individually
				1335	tryCatch({
				1336	kqo@collectedMatches$pos <- df$pos
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	1337	kqo@collectedMatches$lemma <- df$lemma
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1338	kqo@collectedMatches$morph <- df$morph
				1339	kqo@collectedMatches$atokens <- df$atokens
				1340	kqo@collectedMatches$annotation_snippet <- df$annotation_snippet
				1341	}, error = function(col_error) {
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1342	warning("Failed to add annotation data to collectedMatches")
				1343	})
				1344	})
				1345
Marc Kupietz	e8c0fef	2025-07-18 19:59:04 +0200	[diff] [blame]	1346	if (verbose) {
				1347	elapsed_time <- Sys.time() - start_time
				1348	log_info(verbose, paste("Finished fetching annotations for", nrows, "matches in", format_duration(as.numeric(elapsed_time, units = "secs")), "\n"))
				1349	}
				1350
Marc Kupietz	e52b295	2025-07-17 16:53:02 +0200	[diff] [blame]	1351	return(kqo)
				1352	})
				1353
Marc Kupietz	ad8d2ed	2025-04-05 15:37:38 +0200	[diff] [blame]	1354	#' Query frequencies of search expressions in virtual corpora
Marc Kupietz	3f57528	2019-10-04 14:46:04 +0200	[diff] [blame]	1355	#'
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	1356	#' `frequencyQuery` combines [corpusQuery()], [corpusStats()] and
Marc Kupietz	ad8d2ed	2025-04-05 15:37:38 +0200	[diff] [blame]	1357	#' [ci()] to compute a tibble with the absolute and relative frequencies and
Marc Kupietz	3f57528	2019-10-04 14:46:04 +0200	[diff] [blame]	1358	#' confidence intervals of one ore multiple search terms across one or multiple
				1359	#' virtual corpora.
				1360	#'
Marc Kupietz	a8c40f4	2025-06-24 15:49:52 +0200	[diff] [blame]	1361	#' @family frequency analysis
Marc Kupietz	3f57528	2019-10-04 14:46:04 +0200	[diff] [blame]	1362	#' @aliases frequencyQuery
Marc Kupietz	3f57528	2019-10-04 14:46:04 +0200	[diff] [blame]	1363	#' @examples
Marc Kupietz	6ae7605	2021-09-21 10:34:00 +0200	[diff] [blame]	1364	#' \dontrun{
				1365	#'
Marc Kupietz	ad8d2ed	2025-04-05 15:37:38 +0200	[diff] [blame]	1366	#' KorAPConnection(verbose = TRUE) \|>
Marc Kupietz	3f57528	2019-10-04 14:46:04 +0200	[diff] [blame]	1367	#' frequencyQuery(c("Mücke", "Schnake"), paste0("pubDate in ", 2000:2003))
Marc Kupietz	05b2277	2020-02-18 21:58:42 +0100	[diff] [blame]	1368	#' }
Marc Kupietz	3f57528	2019-10-04 14:46:04 +0200	[diff] [blame]	1369	#'
Marc Kupietz	ad8d2ed	2025-04-05 15:37:38 +0200	[diff] [blame]	1370	# @inheritParams corpusQuery
Marc Kupietz	617266d	2025-02-27 10:43:07 +0100	[diff] [blame]	1371	#' @param kco [KorAPConnection()] object (obtained e.g. from `KorAPConnection()`
Marc Kupietz	ad8d2ed	2025-04-05 15:37:38 +0200	[diff] [blame]	1372	#' @param query corpus query string(s.) (can be a vector). The query language depends on the `ql` parameter. Either `query` must be provided or `KorAPUrl`.
				1373	#' @param vc virtual corpus definition(s) (can be a vector)
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	1374	#' @param conf.level confidence level of the returned confidence interval (passed through [ci()] to [prop.test()]).
				1375	#' @param as.alternatives LOGICAL that specifies if the query terms should be treated as alternatives. If `as.alternatives` is TRUE, the sum over all query hits, instead of the respective vc token sizes is used as total for the calculation of relative frequencies.
Marc Kupietz	ad8d2ed	2025-04-05 15:37:38 +0200	[diff] [blame]	1376	#' @param ... further arguments passed to or from other methods (see [corpusQuery()]), most notably `expand`, a logical that decides if `query` and `vc` parameters are expanded to all of their combinations. It defaults to `TRUE`, if `query` and `vc` have different lengths, and to `FALSE` otherwise.
Marc Kupietz	3f57528	2019-10-04 14:46:04 +0200	[diff] [blame]	1377	#' @export
Marc Kupietz	ad8d2ed	2025-04-05 15:37:38 +0200	[diff] [blame]	1378	#'
				1379	#' @return A tibble, with each row containing the following result columns for query and vc combinations:
				1380	#' - query: the query string used for the frequency analysis.
				1381	#' - totalResults: absolute frequency of query matches in the vc.
				1382	#' - vc: virtual corpus used for the query.
				1383	#' - webUIRequestUrl: URL of the corresponding web UI request with respect to query and vc.
				1384	#' - total: total number of words in vc.
				1385	#' - f: relative frequency of query matches in the vc.
				1386	#' - conf.low: lower bound of the confidence interval for the relative frequency, given `conf.level`.
				1387	#' - conf.high: upper bound of the confidence interval for the relative frequency, given `conf.level`.
				1388
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	1389	setMethod(
				1390	"frequencyQuery", "KorAPConnection",
Marc Kupietz	71d6e05	2019-11-22 18:42:10 +0100	[diff] [blame]	1391	function(kco, query, vc = "", conf.level = 0.95, as.alternatives = FALSE, ...) {
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	1392	(if (as.alternatives) {
				1393	corpusQuery(kco, query, vc, metadataOnly = TRUE, as.df = TRUE, ...) \|>
Marc Kupietz	ea34b81	2025-06-25 15:49:00 +0200	[diff] [blame]	1394	group_by(vc) \|>
Marc Kupietz	71d6e05	2019-11-22 18:42:10 +0100	[diff] [blame]	1395	mutate(total = sum(totalResults))
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	1396	} else {
				1397	corpusQuery(kco, query, vc, metadataOnly = TRUE, as.df = TRUE, ...) \|>
				1398	mutate(total = corpusStats(kco, vc = vc, as.df = TRUE)$tokens)
Marc Kupietz	ea34b81	2025-06-25 15:49:00 +0200	[diff] [blame]	1399	}) \|>
Marc Kupietz	0c29cea	2019-10-09 08:44:36 +0200	[diff] [blame]	1400	ci(conf.level = conf.level)
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	1401	}
				1402	)
Marc Kupietz	3f57528	2019-10-04 14:46:04 +0200	[diff] [blame]	1403
Marc Kupietz	38a9d68	2024-12-06 16:17:09 +0100	[diff] [blame]	1404	#' buildWebUIRequestUrlFromString
				1405	#'
				1406	#' @rdname KorAPQuery-class
				1407	#' @importFrom urltools url_encode
				1408	#' @export
				1409	buildWebUIRequestUrlFromString <- function(KorAPUrl,
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	1410	query,
				1411	vc = "",
				1412	ql = "poliqarp") {
Marc Kupietz	38a9d68	2024-12-06 16:17:09 +0100	[diff] [blame]	1413	if ("KorAPConnection" %in% class(KorAPUrl)) {
				1414	KorAPUrl <- KorAPUrl@KorAPUrl
				1415	}
				1416
				1417	request <-
				1418	paste0(
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	1419	"?q=",
Marc Kupietz	38a9d68	2024-12-06 16:17:09 +0100	[diff] [blame]	1420	urltools::url_encode(enc2utf8(as.character(query))),
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	1421	ifelse(vc != "",
				1422	paste0("&cq=", urltools::url_encode(enc2utf8(vc))),
				1423	""
				1424	),
				1425	"&ql=",
Marc Kupietz	38a9d68	2024-12-06 16:17:09 +0100	[diff] [blame]	1426	ql
				1427	)
				1428	paste0(KorAPUrl, request)
				1429	}
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1430
				1431	#' buildWebUIRequestUrl
				1432	#'
				1433	#' @rdname KorAPQuery-class
Marc Kupietz	f912959	2025-01-26 19:17:54 +0100	[diff] [blame]	1434	#' @importFrom httr2 url_parse
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1435	#' @export
				1436	buildWebUIRequestUrl <- function(kco,
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	1437	query = if (missing(KorAPUrl)) {
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1438	stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	1439	} else {
				1440	httr2::url_parse(KorAPUrl)$query$q
				1441	},
Marc Kupietz	f912959	2025-01-26 19:17:54 +0100	[diff] [blame]	1442	vc = if (missing(KorAPUrl)) "" else httr2::url_parse(KorAPUrl)$query$cq,
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1443	KorAPUrl,
Marc Kupietz	f912959	2025-01-26 19:17:54 +0100	[diff] [blame]	1444	ql = if (missing(KorAPUrl)) "poliqarp" else httr2::url_parse(KorAPUrl)$query$ql) {
Marc Kupietz	38a9d68	2024-12-06 16:17:09 +0100	[diff] [blame]	1445	buildWebUIRequestUrlFromString(kco@KorAPUrl, query, vc, ql)
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1446	}
				1447
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	1448	#' format()
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	1449	#' @rdname KorAPQuery-class
				1450	#' @param x KorAPQuery object
				1451	#' @param ... further arguments passed to or from other methods
Marc Kupietz	b73ca0f	2025-01-28 20:45:01 +0100	[diff] [blame]	1452	#' @importFrom urltools param_get url_decode
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	1453	#' @export
				1454	format.KorAPQuery <- function(x, ...) {
				1455	cat("<KorAPQuery>\n")
				1456	q <- x
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	1457	param <- urltools::param_get(q@request) \|> lapply(urltools::url_decode)
Marc Kupietz	b73ca0f	2025-01-28 20:45:01 +0100	[diff] [blame]	1458	cat(" Query: ", param$q, "\n")
				1459	if (!is.null(param$cq) && param$cq != "") {
				1460	cat(" Virtual corpus: ", param$cq, "\n")
				1461	}
				1462	if (!is.null(q@collectedMatches)) {
				1463	cat("==============================================================================================================", "\n")
				1464	print(summary(q@collectedMatches))
				1465	cat("==============================================================================================================", "\n")
				1466	}
				1467	cat(" Total results: ", q@totalResults, "\n")
				1468	cat(" Fetched results: ", q@nextStartIndex, "\n")
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1469	if (!is.null(q@collectedMatches) && "pos" %in% colnames(q@collectedMatches)) {
				1470	successful_annotations <- sum(!is.na(q@collectedMatches$annotation_snippet))
				1471	parsed_annotations <- sum(!is.na(q@collectedMatches$pos))
				1472	cat(" Annotations: ", successful_annotations, " of ", nrow(q@collectedMatches), " matches")
				1473	if (parsed_annotations > 0) {
				1474	cat(" (", parsed_annotations, " with parsed linguistic data)")
				1475	}
				1476	cat("\n")
Marc Kupietz	e52b295	2025-07-17 16:53:02 +0200	[diff] [blame]	1477	}
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	1478	}
				1479
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	1480	#' show()
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	1481	#'
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	1482	#' @rdname KorAPQuery-class
				1483	#' @param object KorAPQuery object
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	1484	#' @export
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	1485	setMethod("show", "KorAPQuery", function(object) {
				1486	format(object)
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	1487	invisible(object)
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	1488	})