Blame - R/KorAPQuery.R - KorAP/RKorAPClient

blob: 182ac3ef646ebbf2b4f6efb01913559faa289649 [file] [log] [blame]

Marc Kupietz	a8c40f4	2025-06-24 15:49:52 +0200	[diff] [blame]	1	#' KorAPQuery class (internal)
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	2	#'
Marc Kupietz	a8c40f4	2025-06-24 15:49:52 +0200	[diff] [blame]	3	#' Internal class for query state management. Users work with `corpusQuery()`, `fetchAll()`, and `fetchNext()` instead.
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	4	#'
Marc Kupietz	a8c40f4	2025-06-24 15:49:52 +0200	[diff] [blame]	5	#' @keywords internal
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	6	#' @include KorAPConnection.R
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	7	#' @include logging.R
Marc Kupietz	f912959	2025-01-26 19:17:54 +0100	[diff] [blame]	8	#' @import httr2
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	9	#'
Marc Kupietz	a6e4ee6	2021-03-05 09:00:15 +0100	[diff] [blame]	10	#' @include RKorAPClient-package.R
Marc Kupietz	5bbc9db	2019-08-30 16:30:45 +0200	[diff] [blame]	11
#' @export
# S4 class holding the state of one KorAP query. All slots are declared
# without a type (i.e. class "ANY"), so they may also hold NULL.
KorAPQuery <- setClass("KorAPQuery", slots = c(
  "korapConnection", # KorAPConnection object the query was issued through
  "request", # query part of the request URL
  "vc", # definition of the virtual corpus
  "totalResults", # number of hits the query has yielded
  "nextStartIndex", # index at which the next fetch of results starts
  "fields", # data / metadata fields to be collected
  "requestUrl", # complete URL of the API request
  "webUIRequestUrl", # URL of the corresponding web frontend request
  "apiResponse", # representation of the JSON response of the API request
  "collectedMatches", # matches already fetched from the KorAP-API-server
  "hasMoreMatches" # logical: can more query results be fetched?
))
Marc Kupietz	5bbc9db	2019-08-30 16:30:45 +0200	[diff] [blame]	26
Marc Kupietz	a8c40f4	2025-06-24 15:49:52 +0200	[diff] [blame]	27	#' Initialize KorAPQuery object
				28	#' @keywords internal
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	29	#' @param .Object …
Marc Kupietz	b897218	2019-09-20 21:33:46 +0200	[diff] [blame]	30	#' @param korapConnection KorAPConnection object
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	31	#' @param request query part of the request URL
				32	#' @param vc definition of a virtual corpus
				33	#' @param totalResults number of hits the query has yielded
				34	#' @param nextStartIndex at what index to start the next fetch of query results
				35	#' @param fields what data / metadata fields should be collected
				36	#' @param requestUrl complete URL of the API request
				37	#' @param webUIRequestUrl URL of a web frontend request corresponding to the API request
				38	#' @param apiResponse data-frame representation of the JSON response of the API request
Marc Kupietz	7776dec	2019-09-27 16:59:02 +0200	[diff] [blame]	39	#' @param hasMoreMatches logical that signals if more query results can be fetched
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	40	#' @param collectedMatches matches already fetched from the KorAP-API-server
Marc Kupietz	97a1bca	2019-10-04 22:52:09 +0200	[diff] [blame]	41	#'
				42	#' @importFrom tibble tibble
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	43	#' @export
setMethod(
  "initialize", "KorAPQuery",
  function(.Object,
           korapConnection = NULL,
           request = NULL,
           vc = "",
           totalResults = 0,
           nextStartIndex = 0,
           fields = c(
             "corpusSigle", "textSigle", "pubDate", "pubPlace",
             "availability", "textClass", "snippet", "tokens"
           ),
           requestUrl = "",
           webUIRequestUrl = "",
           apiResponse = NULL,
           hasMoreMatches = FALSE,
           collectedMatches = NULL) {
    # Run the inherited initializer first, then set every slot explicitly so
    # that the defaults declared in this signature always take effect.
    .Object <- callNextMethod()

    # Where the query was sent and what was asked
    .Object@korapConnection <- korapConnection
    .Object@request <- request
    .Object@vc <- vc
    .Object@fields <- fields
    .Object@requestUrl <- requestUrl
    .Object@webUIRequestUrl <- webUIRequestUrl

    # Paging / result state
    .Object@totalResults <- totalResults
    .Object@nextStartIndex <- nextStartIndex
    .Object@apiResponse <- apiResponse
    .Object@hasMoreMatches <- hasMoreMatches
    .Object@collectedMatches <- collectedMatches

    .Object
  }
)
Marc Kupietz	632cbd4	2019-09-06 16:04:51 +0200	[diff] [blame]	66
# S4 generics of the query API. Generics dispatching on a connection object
# take `kco` as first argument, those dispatching on a query object take `kqo`.
setGeneric("corpusQuery", function(kco, ...) standardGeneric("corpusQuery"))
setGeneric("fetchAll", function(kqo, ...) standardGeneric("fetchAll"))
setGeneric("fetchNext", function(kqo, ...) standardGeneric("fetchNext"))
setGeneric("fetchRest", function(kqo, ...) standardGeneric("fetchRest"))
setGeneric("fetchAnnotations", function(kqo, ...) standardGeneric("fetchAnnotations"))
setGeneric("frequencyQuery", function(kco, ...) standardGeneric("frequencyQuery"))
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	73
				74	maxResultsPerPage <- 50
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	75
Marc Kupietz	4de53ec	2019-10-04 09:12:00 +0200	[diff] [blame]	76	## quiets concerns of R CMD check re: the .'s that appear in pipelines
Marc Kupietz	ef1ef4a	2025-02-19 12:12:40 +0100	[diff] [blame]	77	utils::globalVariables(c("."))
Marc Kupietz	632cbd4	2019-09-06 16:04:51 +0200	[diff] [blame]	78
Marc Kupietz	a8c40f4	2025-06-24 15:49:52 +0200	[diff] [blame]	79	#' Search corpus for query terms
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	80	#'
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	81	#' `corpusQuery` performs a corpus query via a connection to a KorAP-API-server
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	82	#'
Marc Kupietz	a8c40f4	2025-06-24 15:49:52 +0200	[diff] [blame]	83	#' @family corpus search functions
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	84	#' @aliases corpusQuery
				85	#'
				86	#' @importFrom urltools url_encode
				87	#' @importFrom purrr pmap
Marc Kupietz	ea34b81	2025-06-25 15:49:00 +0200	[diff] [blame]	88	#' @importFrom dplyr bind_rows group_by
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	89	#'
#' @param kco [KorAPConnection()] object (obtained e.g. from `KorAPConnection()`)
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	91	#' @param query string that contains the corpus query. The query language depends on the `ql` parameter. Either `query` must be provided or `KorAPUrl`.
Marc Kupietz	632cbd4	2019-09-06 16:04:51 +0200	[diff] [blame]	92	#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	93	#' @param KorAPUrl instead of providing the query and vc string parameters, you can also simply copy a KorAP query URL from your browser and use it here (and in `KorAPConnection`) to provide all necessary information for the query.
Marc Kupietz	132f005	2023-04-16 14:23:05 +0200	[diff] [blame]	94	#' @param metadataOnly logical that determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. Note that the default value is TRUE.
				95	#' If you want your corpus queries to return not only metadata, but also KWICS, you need to authorize
				96	#' your RKorAPClient application as explained in the
				97	#' [authorization section](https://github.com/KorAP/RKorAPClient#authorization)
				98	#' of the RKorAPClient Readme on GitHub and set the `metadataOnly` parameter to
				99	#' `FALSE`.
#' @param ql string to choose the query language (see [section on Query Parameters](https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET#user-content-parameters) in the Kustvakt-Wiki for possible values).
Marc Kupietz	1623fe8	2025-06-24 16:31:46 +0200	[diff] [blame]	101	#' @param fields character vector specifying which metadata fields to retrieve for each match.
				102	#' Available fields depend on the corpus. For DeReKo (German Reference Corpus), possible fields include:
				103	#' \describe{
				104	#' \item{Text identification:}{`textSigle`, `docSigle`, `corpusSigle` - hierarchical text identifiers}
				105	#' \item{Publication info:}{`author`, `editor`, `title`, `docTitle`, `corpusTitle` - authorship and titles}
				106	#' \item{Temporal data:}{`pubDate`, `creationDate` - when text was published/created}
				107	#' \item{Publication details:}{`pubPlace`, `publisher`, `reference` - where/how published}
				108	#' \item{Text classification:}{`textClass`, `textType`, `textTypeArt`, `textDomain`, `textColumn` - topic domain, genre, text type and column}
#' \item{Administrative and technical info:}{`corpusEditor`, `availability`, `language`, `foundries` - access rights and annotations}
				110	#' \item{Content data:}{`snippet`, `tokens`, `tokenSource`, `externalLink` - actual text content, tokenization, and link to source text}
				111	#' \item{System data:}{`indexCreationDate`, `indexLastModified` - corpus indexing info}
				112	#' }
				113	#' Use `c("textSigle", "pubDate", "author")` to retrieve multiple fields.
				114	#' Default fields provide basic text identification and publication metadata. The actual text content (`snippet` and `tokens`) are activated by default if `metadataOnly` is set to `FALSE`.
Marc Kupietz	43a6ade	2020-02-18 17:01:44 +0100	[diff] [blame]	115	#' @param accessRewriteFatal abort if query or given vc had to be rewritten due to insufficient rights (not yet implemented).
Marc Kupietz	25aebc3	2019-09-16 18:40:50 +0200	[diff] [blame]	116	#' @param verbose print some info
Marc Kupietz	4de53ec	2019-10-04 09:12:00 +0200	[diff] [blame]	117	#' @param as.df return result as data frame instead of as S4 object?
Marc Kupietz	ad8d2ed	2025-04-05 15:37:38 +0200	[diff] [blame]	118	#' @param expand logical that decides if `query` and `vc` parameters are expanded to all of their combinations. Defaults to `TRUE`, iff `query` and `vc` have different lengths
Marc Kupietz	d9b2fd7	2023-04-17 19:08:50 +0200	[diff] [blame]	119	#' @param context string that specifies the size of the left and the right context returned in `snippet`
#' (provided that `metadataOnly` is set to `FALSE` and that the necessary access rights are met).
#' The format of the context size specification (e.g. `3-token,3-token`) is described in the [Service: Search GET documentation of the Kustvakt Wiki](https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET).
#' If the parameter is not set, the default context size specification of the KorAP server instance will be used.
				123	#' Note that you cannot overrule the maximum context size set in the KorAP server instance,
				124	#' as this is typically legally motivated.
Marc Kupietz	ad8d2ed	2025-04-05 15:37:38 +0200	[diff] [blame]	125	#' @return Depending on the `as.df` parameter, a tibble or a [KorAPQuery()] object that, among other information, contains the total number of results in `@totalResults`. The resulting object can be used to fetch all query results (with [fetchAll()]) or the next page of results (with [fetchNext()]).
#' A corresponding URL to be used within a web browser is contained in `@webUIRequestUrl`.
				127	#' Please make sure to check `$collection$rewrites` to see if any unforeseen access rewrites of the query's virtual corpus had to be performed.
Marc Kupietz	632cbd4	2019-09-06 16:04:51 +0200	[diff] [blame]	128	#'
				129	#' @examples
Marc Kupietz	6ae7605	2021-09-21 10:34:00 +0200	[diff] [blame]	130	#' \dontrun{
				131	#'
Marc Kupietz	1623fe8	2025-06-24 16:31:46 +0200	[diff] [blame]	132	#' # Fetch basic metadata for "Ameisenplage"
#' KorAPConnection() |>
#'   corpusQuery("Ameisenplage") |>
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	135	#' fetchAll()
Marc Kupietz	1623fe8	2025-06-24 16:31:46 +0200	[diff] [blame]	136	#'
				137	#' # Fetch specific metadata fields for bibliographic analysis
#' query <- KorAPConnection() |>
Marc Kupietz	1623fe8	2025-06-24 16:31:46 +0200	[diff] [blame]	139	#' corpusQuery("Ameisenplage",
				140	#' fields = c("textSigle", "author", "title", "pubDate", "pubPlace", "textType"))
				141	#' results <- fetchAll(query)
				142	#' results@collectedMatches
Marc Kupietz	657d8e7	2020-02-25 18:31:50 +0100	[diff] [blame]	143	#' }
Marc Kupietz	3c531f6	2019-09-13 12:17:24 +0200	[diff] [blame]	144	#'
Marc Kupietz	6ae7605	2021-09-21 10:34:00 +0200	[diff] [blame]	145	#' \dontrun{
				146	#'
Marc Kupietz	603491f	2019-09-18 14:01:02 +0200	[diff] [blame]	147	#' # Use the copy of a KorAP-web-frontend URL for an API query of "Ameise" in a virtual corpus
				148	#' # and show the number of query hits (but don't fetch them).
Marc Kupietz	69cc54a	2019-09-30 12:06:54 +0200	[diff] [blame]	149	#'
#' KorAPConnection(verbose = TRUE) |>
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	151	#' corpusQuery(
				152	#' KorAPUrl =
				153	#' "https://korap.ids-mannheim.de/?q=Ameise&cq=pubDate+since+2017&ql=poliqarp"
				154	#' )
Marc Kupietz	6ae7605	2021-09-21 10:34:00 +0200	[diff] [blame]	155	#' }
				156	#'
				157	#' \dontrun{
Marc Kupietz	3c531f6	2019-09-13 12:17:24 +0200	[diff] [blame]	158	#'
Marc Kupietz	603491f	2019-09-18 14:01:02 +0200	[diff] [blame]	159	#' # Plot the time/frequency curve of "Ameisenplage"
#' kco <- KorAPConnection(verbose = TRUE)
#' kco |>
#'   corpusQuery("Ameisenplage") |>
#'   fetchAll() |>
#'   slot("collectedMatches") |>
#'   mutate(year = lubridate::year(pubDate)) |>
#'   dplyr::select(year) |>
#'   group_by(year) |>
#'   summarise(Count = dplyr::n()) |>
#'   mutate(Freq = mapply(function(f, y) {
#'     f / corpusStats(kco, paste("pubDate in", y))@tokens
#'   }, Count, year)) |>
#'   dplyr::select(-Count) |>
#'   complete(year = min(year):max(year), fill = list(Freq = 0)) |>
Marc Kupietz	69cc54a	2019-09-30 12:06:54 +0200	[diff] [blame]	176	#' plot(type = "l")
Marc Kupietz	05b2277	2020-02-18 21:58:42 +0100	[diff] [blame]	177	#' }
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	178	#' @seealso [KorAPConnection()], [fetchNext()], [fetchRest()], [fetchAll()], [corpusStats()]
Marc Kupietz	632cbd4	2019-09-06 16:04:51 +0200	[diff] [blame]	179	#'
				180	#' @references
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	181	#' <https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026>
Marc Kupietz	632cbd4	2019-09-06 16:04:51 +0200	[diff] [blame]	182	#'
				183	#' @export
setMethod(
  "corpusQuery", "KorAPConnection",
  function(kco,
           query = if (missing(KorAPUrl)) {
             stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
           } else {
             httr2::url_parse(KorAPUrl)$query$q
           },
           vc = if (missing(KorAPUrl)) "" else httr2::url_parse(KorAPUrl)$query$cq,
           KorAPUrl,
           metadataOnly = TRUE,
           ql = if (missing(KorAPUrl)) "poliqarp" else httr2::url_parse(KorAPUrl)$query$ql,
           fields = c(
             "corpusSigle",
             "textSigle",
             "pubDate",
             "pubPlace",
             "availability",
             "textClass",
             "snippet",
             "tokens"
           ),
           accessRewriteFatal = TRUE,
           verbose = kco@verbose,
           expand = length(vc) != length(query),
           as.df = FALSE,
           context = NULL) {
    # A pasted KorAP URL need not carry `cq` or `ql`; url_parse() then yields
    # NULL for them. Fall back to the same defaults that apply when no
    # KorAPUrl is given, so that no empty "&ql=" is sent and `vc` is a string.
    if (is.null(vc)) {
      vc <- ""
    }
    if (is.null(ql)) {
      ql <- "poliqarp"
    }

    # Fields actually requested from the API: content fields are dropped for
    # metadata-only queries, and textSigle is always included.
    contentFields <- c("snippet", "tokens")
    if (metadataOnly) {
      fields <- fields[!fields %in% contentFields]
    }
    if (!"textSigle" %in% fields) {
      fields <- c(fields, "textSigle")
    }

    # Build the request string shared by web UI and API plus both full URLs
    # for one query / virtual corpus pair.
    build_urls <- function(query, vc) {
      request <-
        paste0(
          "?q=",
          url_encode(enc2utf8(query)),
          ifelse(!metadataOnly && !is.null(context) && context != "", paste0("&context=", url_encode(enc2utf8(context))), ""),
          ifelse(vc != "", paste0("&cq=", url_encode(enc2utf8(vc))), ""),
          ifelse(!metadataOnly, "&show-tokens=true", ""),
          "&ql=", ql
        )
      list(
        request = request,
        webUIRequestUrl = paste0(kco@KorAPUrl, request),
        requestUrl = paste0(
          kco@apiUrl,
          "search",
          request,
          "&fields=",
          paste(fields, collapse = ","),
          if (metadataOnly) "&access-rewrite-disabled=true" else ""
        )
      )
    }

    # Round a benchmark given as a string ending in "s" (e.g. "0.12345s") to
    # two decimals; any other format is passed through unchanged.
    format_benchmark <- function(benchmark) {
      if (is.character(benchmark) && grepl("s$", benchmark)) {
        paste0(round(as.numeric(sub("s$", "", benchmark)), 2), "s")
      } else {
        benchmark
      }
    }

    if (length(query) > 1 || length(vc) > 1) {
      # Several queries and/or virtual corpora: run one count-only request per
      # row of the grid and return one data frame row for each.
      grid <- if (expand) expand_grid(query = query, vc = vc) else tibble(query = query, vc = vc)

      # Timing state for the ETA display
      total_queries <- nrow(grid)
      current_query <- 0
      start_time <- Sys.time()

      results <- purrr::pmap(grid, function(query, vc, ...) {
        # Counter lives in the enclosing method frame, hence `<<-`
        current_query <<- current_query + 1

        urls <- build_urls(query, vc)

        # Show individual query progress
        log_info(verbose, "\rSearching \"", query, "\" in \"", vc, "\"", sep = "")
        res <- apiCall(kco, paste0(urls$requestUrl, "&count=0"))
        if (is.null(res)) {
          log_info(verbose, ": API call failed\n")
          totalResults <- 0
        } else {
          totalResults <- as.integer(res$meta$totalResults)
          log_info(verbose, ": ", totalResults, " hits")
          if (!is.null(res$meta$cached)) {
            log_info(verbose, " [cached]")
          } else if (!is.null(res$meta$benchmark)) {
            log_info(verbose, ", took ", format_benchmark(res$meta$benchmark))
          }

          # Append ETA information to the same output line
          if (verbose && total_queries > 1) {
            eta_info <- calculate_eta(current_query, total_queries, start_time)
            if (eta_info != "") {
              log_info(verbose, eta_info)
            }
          }

          log_info(verbose, "\n")
        }

        data.frame(
          query = query,
          totalResults = totalResults,
          vc = vc,
          webUIRequestUrl = urls$webUIRequestUrl,
          stringsAsFactors = FALSE
        )
      })

      results %>% bind_rows()
    } else {
      # Single query: one count-only request; result is a data frame row or a
      # KorAPQuery object, depending on `as.df`.
      urls <- build_urls(query, vc)

      log_info(verbose, "\rSearching \"", query, "\" in \"", vc, "\"", sep = "")
      res <- apiCall(kco, paste0(urls$requestUrl, "&count=0"))
      if (is.null(res)) {
        message("API call failed.")
        totalResults <- 0
      } else {
        totalResults <- as.integer(res$meta$totalResults)
        log_info(verbose, ": ", totalResults, " hits")
        if (!is.null(res$meta$cached)) {
          log_info(verbose, " [cached]\n")
        } else if (!is.null(res$meta$benchmark)) {
          log_info(verbose, ", took ", format_benchmark(res$meta$benchmark), "\n", sep = "")
        } else {
          log_info(verbose, "\n")
        }
      }

      if (as.df) {
        data.frame(
          query = query,
          totalResults = totalResults,
          vc = vc,
          webUIRequestUrl = urls$webUIRequestUrl,
          stringsAsFactors = FALSE
        )
      } else {
        KorAPQuery(
          korapConnection = kco,
          nextStartIndex = 0,
          fields = fields,
          requestUrl = urls$requestUrl,
          request = urls$request,
          totalResults = totalResults,
          vc = vc,
          apiResponse = res,
          webUIRequestUrl = urls$webUIRequestUrl,
          hasMoreMatches = (totalResults > 0)
        )
      }
    }
  }
)
Marc Kupietz	5bbc9db	2019-08-30 16:30:45 +0200	[diff] [blame]	378
#' Flatten a possibly list-valued vector into a plain vector
#'
#' For a list, every element with more than one value is collapsed into a
#' single space-separated string and the result is converted with
#' `as.character()`. For any other input, `NA` entries are replaced by `""`.
#'
#' @param x a list or an atomic vector
#' @return for list input a character vector with one entry per list element;
#'   otherwise the result of `ifelse(is.na(x), "", x)`
#' @keywords internal
#' @noRd
repair_data_strcuture <- function(x) {
  if (is.list(x)) {
    collapsed <- lapply(x, function(element) {
      if (length(element) > 1) {
        paste(element, collapse = " ")
      } else {
        element
      }
    })
    as.character(collapsed)
  } else {
    ifelse(is.na(x), "", x)
  }
}
				391
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	392	#' Fetch the next bunch of results of a KorAP query.
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	393	#'
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	394	#' `fetchNext` fetches the next bunch of results of a KorAP query.
Marc Kupietz	3f57528	2019-10-04 14:46:04 +0200	[diff] [blame]	395	#'
Marc Kupietz	a8c40f4	2025-06-24 15:49:52 +0200	[diff] [blame]	396	#' @family corpus search functions
				397	#'
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	398	#' @param kqo object obtained from [corpusQuery()]
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	399	#' @param offset start offset for query results to fetch
				400	#' @param maxFetch maximum number of query results to fetch
Marc Kupietz	25aebc3	2019-09-16 18:40:50 +0200	[diff] [blame]	401	#' @param verbose print progress information if true
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	402	#' @param randomizePageOrder fetch result pages in pseudo random order if true. Use [set.seed()] to set seed for reproducible results.
				403	#' @return The `kqo` input object with updated slots `collectedMatches`, `apiResponse`, `nextStartIndex`, `hasMoreMatches`
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	404	#'
Marc Kupietz	05b2277	2020-02-18 21:58:42 +0100	[diff] [blame]	405	#' @examples
Marc Kupietz	6ae7605	2021-09-21 10:34:00 +0200	[diff] [blame]	406	#' \dontrun{
				407	#'
#' q <- KorAPConnection() |>
#'   corpusQuery("Ameisenplage") |>
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	410	#' fetchNext()
Marc Kupietz	05b2277	2020-02-18 21:58:42 +0100	[diff] [blame]	411	#' q@collectedMatches
Marc Kupietz	657d8e7	2020-02-25 18:31:50 +0100	[diff] [blame]	412	#' }
Marc Kupietz	05b2277	2020-02-18 21:58:42 +0100	[diff] [blame]	413	#'
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	414	#' @references
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	415	#' <https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026>
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	416	#'
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	417	#' @aliases fetchNext
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	418	#' @importFrom dplyr rowwise mutate bind_rows select summarise n select
Marc Kupietz	f488112	2024-12-17 14:55:39 +0100	[diff] [blame]	419	#' @importFrom tibble enframe add_column
				420	#' @importFrom stringr word
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	421	#' @importFrom tidyr unnest unchop pivot_wider
				422	#' @importFrom purrr map
Marc Kupietz	632cbd4	2019-09-06 16:04:51 +0200	[diff] [blame]	423	#' @export
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	424	setMethod("fetchNext", "KorAPQuery", function(kqo,
				425	offset = kqo@nextStartIndex,
				426	maxFetch = maxResultsPerPage,
				427	verbose = kqo@korapConnection@verbose,
				428	randomizePageOrder = FALSE) {
Marc Kupietz	a7a8f1b	2024-12-18 15:56:19 +0100	[diff] [blame]	429	# https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	430	results <- key <- name <- tmp_positions <- 0
Marc Kupietz	a7a8f1b	2024-12-18 15:56:19 +0100	[diff] [blame]	431
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	432	if (kqo@totalResults == 0 \|\| offset >= kqo@totalResults) {
				433	return(kqo)
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	434	}
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	435	use_korap_api <- Sys.getenv("USE_KORAP_API", unset = NA)
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	436	# Calculate the initial page number (not used directly - keeping for reference)
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	437	collectedMatches <- kqo@collectedMatches
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	438
Marc Kupietz	24799fd	2025-06-25 14:15:36 +0200	[diff] [blame]	439	# Track start time for ETA calculation
				440	start_time <- Sys.time()
				441
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	442	# For randomized page order, generate a list of randomized page indices
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	443	if (randomizePageOrder) {
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	444	# Calculate how many pages we need to fetch based on maxFetch
				445	total_pages_to_fetch <- if (!is.na(maxFetch)) {
				446	# Either limited by maxFetch or total results, whichever is smaller
				447	min(ceiling(maxFetch / maxResultsPerPage), ceiling(kqo@totalResults / maxResultsPerPage))
				448	} else {
				449	# All pages
				450	ceiling(kqo@totalResults / maxResultsPerPage)
				451	}
				452
				453	# Generate randomized page indices (0-based for API)
				454	pages <- sample.int(ceiling(kqo@totalResults / maxResultsPerPage), total_pages_to_fetch) - 1
				455	page_index <- 1 # Index to track which page in the randomized list we're on
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	456	}
				457
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	458	if (is.null(collectedMatches)) {
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	459	collectedMatches <- data.frame()
				460	}
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	461
				462	# Initialize the page counter properly based on nextStartIndex and any previously fetched results
				463	# We add 1 to make it 1-based for display purposes since users expect page numbers to start from 1
				464	# For first call, this will be 1, for subsequent calls, it will reflect our actual position
				465	current_page_number <- ceiling(offset / maxResultsPerPage) + 1
				466
				467	# For sequential fetches, keep track of which global page we're on
				468	# This is important for correctly showing page numbers in subsequent fetchNext calls
				469	page_count_start <- current_page_number
				470
Marc Kupietz	5bbc9db	2019-08-30 16:30:45 +0200	[diff] [blame]	471	repeat {
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	472	# Determine which page to fetch next
				473	if (randomizePageOrder) {
				474	# In randomized mode, get the page from our randomized list using the page_index
				475	# Make sure we don't exceed the array bounds
				476	if (page_index > length(pages)) {
				477	break # No more pages to fetch in randomized mode
				478	}
				479	current_offset_page <- pages[page_index]
				480	# For display purposes in randomized mode, show which page out of the total we're fetching
				481	display_page_number <- page_index
				482	} else {
				483	# In sequential mode, use the current_page_number to calculate the offset
				484	current_offset_page <- (current_page_number - 1)
				485	display_page_number <- current_page_number
				486	}
				487
				488	# Calculate the actual offset in tokens
				489	currentOffset <- current_offset_page * maxResultsPerPage
				490
Marc Kupietz	ef0e939	2025-06-18 12:21:49 +0200	[diff] [blame]	491	# Build the query with the appropriate count and offset using httr2
				492	count_param <- min(if (!is.na(maxFetch)) maxFetch - results else maxResultsPerPage, maxResultsPerPage)
Marc Kupietz	ecc8670	2025-06-24 12:12:51 +0200	[diff] [blame]	493
Marc Kupietz	ef0e939	2025-06-18 12:21:49 +0200	[diff] [blame]	494	# Parse existing URL to preserve all query parameters
				495	parsed_url <- httr2::url_parse(kqo@requestUrl)
				496	existing_query <- parsed_url$query
Marc Kupietz	ecc8670	2025-06-24 12:12:51 +0200	[diff] [blame]	497
Marc Kupietz	ef0e939	2025-06-18 12:21:49 +0200	[diff] [blame]	498	# Add/update count and offset parameters
				499	existing_query$count <- count_param
				500	existing_query$offset <- currentOffset
Marc Kupietz	ecc8670	2025-06-24 12:12:51 +0200	[diff] [blame]	501
Marc Kupietz	ef0e939	2025-06-18 12:21:49 +0200	[diff] [blame]	502	# Rebuild the URL with all parameters
				503	query <- httr2::url_modify(kqo@requestUrl, query = existing_query)
Marc Kupietz	6817095	2021-06-30 09:37:21 +0200	[diff] [blame]	504	res <- apiCall(kqo@korapConnection, query)
				505	if (length(res$matches) == 0) {
				506	break
				507	}
				508
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	509	if ("fields" %in% colnames(res$matches) && (is.na(use_korap_api) \|\| as.numeric(use_korap_api) >= 1.0)) {
Marc Kupietz	16ccf11	2025-01-26 13:25:27 +0100	[diff] [blame]	510	log_info(verbose, "Using fields API: ")
Marc Kupietz	05a6079	2024-12-07 16:23:31 +0100	[diff] [blame]	511	currentMatches <- res$matches$fields %>%
				512	purrr::map(~ mutate(.x, value = repair_data_strcuture(value))) %>%
				513	tibble::enframe() %>%
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	514	tidyr::unnest(cols = value) %>%
				515	tidyr::pivot_wider(names_from = key, id_cols = name, names_repair = "unique") %>%
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	516	dplyr::select(-name)
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	517	if ("snippet" %in% colnames(res$matches)) {
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	518	currentMatches$snippet <- res$matches$snippet
				519	}
Marc Kupietz	3cd2c6c	2025-01-08 20:35:39 +0100	[diff] [blame]	520	if ("tokens" %in% colnames(res$matches)) {
				521	currentMatches$tokens <- res$matches$tokens
				522	}
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	523	} else {
				524	currentMatches <- res$matches
				525	}
				526
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	527	for (field in kqo@fields) {
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	528	if (!field %in% colnames(currentMatches)) {
				529	currentMatches[, field] <- NA
Marc Kupietz	5bbc9db	2019-08-30 16:30:45 +0200	[diff] [blame]	530	}
				531	}
Marc Kupietz	f488112	2024-12-17 14:55:39 +0100	[diff] [blame]	532	currentMatches <- currentMatches %>%
				533	select(kqo@fields) %>%
				534	mutate(
Marc Kupietz	ff712a9	2025-07-18 09:07:23 +0200	[diff] [blame]	535	matchID = res$matches$matchID,
Marc Kupietz	0447da0	2025-01-08 20:51:09 +0100	[diff] [blame]	536	tmp_positions = gsub(".-p(\\d+)-(\\d+).", "\\1 \\2", res$matches$matchID),
Marc Kupietz	f488112	2024-12-17 14:55:39 +0100	[diff] [blame]	537	matchStart = as.integer(stringr::word(tmp_positions, 1)),
				538	matchEnd = as.integer(stringr::word(tmp_positions, 2)) - 1
				539	) %>%
				540	select(-tmp_positions)
				541
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	542	if (!is.list(collectedMatches)) {
				543	collectedMatches <- currentMatches
Marc Kupietz	5bbc9db	2019-08-30 16:30:45 +0200	[diff] [blame]	544	} else {
Marc Kupietz	2078bde	2023-08-27 16:46:15 +0200	[diff] [blame]	545	collectedMatches <- bind_rows(collectedMatches, currentMatches)
Marc Kupietz	5bbc9db	2019-08-30 16:30:45 +0200	[diff] [blame]	546	}
Marc Kupietz	ae9b617	2025-05-02 15:50:01 +0200	[diff] [blame]	547
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	548	# Get the actual items per page from the API response
				549	# We now consistently use maxResultsPerPage instead
Marc Kupietz	acbaab0	2025-05-01 10:56:35 +0200	[diff] [blame]	550
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	551	# Calculate total pages consistently using fixed maxResultsPerPage
				552	# This ensures consistent page counting across the function
				553	total_pages <- ceiling(kqo@totalResults / maxResultsPerPage)
				554
Marc Kupietz	24799fd	2025-06-25 14:15:36 +0200	[diff] [blame]	555	# Calculate ETA using the centralized function from logging.R
				556	current_page <- if (randomizePageOrder) page_index else display_page_number
				557	total_pages_to_fetch <- if (!is.na(maxFetch)) {
				558	# Account for offset - we can only fetch from the remaining results after offset
				559	remaining_results_after_offset <- max(0, kqo@totalResults - offset)
				560	min(ceiling(maxFetch / maxResultsPerPage), ceiling(remaining_results_after_offset / maxResultsPerPage))
				561	} else {
				562	total_pages
				563	}
Marc Kupietz	365660e	2025-06-25 15:09:55 +0200	[diff] [blame]	564
Marc Kupietz	24799fd	2025-06-25 14:15:36 +0200	[diff] [blame]	565	eta_info <- calculate_eta(current_page, total_pages_to_fetch, start_time)
Marc Kupietz	365660e	2025-06-25 15:09:55 +0200	[diff] [blame]	566
Marc Kupietz	24799fd	2025-06-25 14:15:36 +0200	[diff] [blame]	567	# Extract timing information for display
Marc Kupietz	ae9b617	2025-05-02 15:50:01 +0200	[diff] [blame]	568	time_per_page <- NA
Marc Kupietz	ae9b617	2025-05-02 15:50:01 +0200	[diff] [blame]	569	if (!is.null(res$meta$benchmark) && is.character(res$meta$benchmark)) {
Marc Kupietz	ae9b617	2025-05-02 15:50:01 +0200	[diff] [blame]	570	time_per_page <- suppressWarnings(as.numeric(sub("s", "", res$meta$benchmark)))
Marc Kupietz	acbaab0	2025-05-01 10:56:35 +0200	[diff] [blame]	571	}
				572
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	573	# Create the page display string with proper formatting
Marc Kupietz	acbaab0	2025-05-01 10:56:35 +0200	[diff] [blame]	574
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	575	# For global page tracking, calculate the absolute page number
				576	actual_display_number <- if (randomizePageOrder) {
				577	current_offset_page + 1 # In randomized mode, this is the actual page (0-based + 1)
				578	} else {
				579	# In sequential mode, the absolute page number is the actual offset page + 1 (to make it 1-based)
				580	current_offset_page + 1
				581	}
				582
				583	# For subsequent calls to fetchNext, we need to calculate the correct page numbers
				584	# based on the current batch being fetched
				585
				586	# For each call to fetchNext, we want to show 1/2, 2/2 (not 3/4, 4/4)
				587	# Simply count from 1 within the current batch
				588
				589	# The relative page number is simply the current position in this batch
				590	if (randomizePageOrder) {
				591	relative_page_number <- page_index # In randomized mode, we start from 1 in each batch
				592	} else {
				593	relative_page_number <- display_page_number - (page_count_start - 1)
				594	}
				595
				596	# How many pages will we fetch in this batch?
Marc Kupietz	021663d	2025-06-18 17:49:22 +0200	[diff] [blame]	597	# If maxFetch is specified, calculate the total pages for this fetch operation
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	598	pages_in_this_batch <- if (!is.na(maxFetch)) {
Marc Kupietz	021663d	2025-06-18 17:49:22 +0200	[diff] [blame]	599	# Account for offset - we can only fetch from the remaining results after offset
				600	remaining_results_after_offset <- max(0, kqo@totalResults - offset)
				601	min(ceiling(maxFetch / maxResultsPerPage), ceiling(remaining_results_after_offset / maxResultsPerPage))
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	602	} else {
				603	# Otherwise fetch all remaining pages
				604	total_pages - page_count_start + 1
				605	}
				606
				607	# The total pages to be shown in this batch
				608	batch_total_pages <- pages_in_this_batch
				609
				610	page_display <- paste0(
				611	"Retrieved page ",
				612	sprintf(paste0("%", nchar(batch_total_pages), "d"), relative_page_number),
				613	"/",
				614	sprintf("%d", batch_total_pages)
				615	)
				616
				617	# If randomized, also show which actual page we fetched
				618	if (randomizePageOrder) {
				619	# Determine the maximum width needed for page numbers (based on total pages)
				620	# This ensures consistent alignment
				621	max_page_width <- nchar(as.character(total_pages))
				622	# Add the actual page number that was fetched (0-based + 1 for display) with proper padding
Marc Kupietz	7638ca4	2025-05-25 13:18:16 +0200	[diff] [blame]	623	page_display <- paste0(
				624	page_display,
				625	sprintf(" (actual page %*d)", max_page_width, current_offset_page + 1)
				626	)
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	627	}
				628	# Always show the absolute page number and total pages (for clarity)
				629	else {
				630	# Show the absolute page number (out of total possible pages)
				631	page_display <- paste0(page_display, sprintf(
				632	" (page %d of %d total)",
				633	actual_display_number, total_pages
				634	))
				635	}
				636
				637	# Add caching or timing information
				638	if (!is.null(res$meta$cached)) {
				639	page_display <- paste0(page_display, " [cached]")
				640	} else {
				641	page_display <- paste0(
				642	page_display,
				643	" in ",
				644	if (!is.na(time_per_page)) sprintf("%4.1f", time_per_page) else "?",
Marc Kupietz	24799fd	2025-06-25 14:15:36 +0200	[diff] [blame]	645	"s",
				646	eta_info
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	647	)
				648	}
				649
				650	log_info(verbose, paste0(page_display, "\n"))
				651
				652	# Increment the appropriate counter based on mode
				653	if (randomizePageOrder) {
				654	page_index <- page_index + 1
				655	} else {
				656	current_page_number <- current_page_number + 1
				657	}
Marc Kupietz	5bbc9db	2019-08-30 16:30:45 +0200	[diff] [blame]	658	results <- results + res$meta$itemsPerPage
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	659	if (nrow(collectedMatches) >= kqo@totalResults \|\| (!is.na(maxFetch) && results >= maxFetch)) {
Marc Kupietz	5bbc9db	2019-08-30 16:30:45 +0200	[diff] [blame]	660	break
				661	}
				662	}
Marc Kupietz	6817095	2021-06-30 09:37:21 +0200	[diff] [blame]	663	nextStartIndex <- min(res$meta$startIndex + res$meta$itemsPerPage, kqo@totalResults)
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	664	KorAPQuery(
				665	nextStartIndex = nextStartIndex,
Marc Kupietz	d0d3e9b	2019-09-24 17:36:03 +0200	[diff] [blame]	666	korapConnection = kqo@korapConnection,
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	667	fields = kqo@fields,
				668	requestUrl = kqo@requestUrl,
				669	request = kqo@request,
Marc Kupietz	6817095	2021-06-30 09:37:21 +0200	[diff] [blame]	670	totalResults = kqo@totalResults,
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	671	vc = kqo@vc,
				672	webUIRequestUrl = kqo@webUIRequestUrl,
Marc Kupietz	6817095	2021-06-30 09:37:21 +0200	[diff] [blame]	673	hasMoreMatches = (kqo@totalResults > nextStartIndex),
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	674	apiResponse = res,
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	675	collectedMatches = collectedMatches
				676	)
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	677	})
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	678
				679	#' Fetch all results of a KorAP query.
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	680	#'
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	681	#' `fetchAll` fetches all results of a KorAP query.
Marc Kupietz	a6e4ee6	2021-03-05 09:00:15 +0100	[diff] [blame]	682	#'
Marc Kupietz	a8c40f4	2025-06-24 15:49:52 +0200	[diff] [blame]	683	#' @family corpus search functions
Marc Kupietz	dc880ac	2025-06-24 20:34:43 +0200	[diff] [blame]	684	#' @param kqo object obtained from [corpusQuery()]
				685	#' @param verbose print progress information if true
				686	#' @param ... further arguments passed to [fetchNext()]
				687	#' @return The updated `kqo` object with all results in `@collectedMatches`
Marc Kupietz	a8c40f4	2025-06-24 15:49:52 +0200	[diff] [blame]	688	#'
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	689	#' @examples
Marc Kupietz	6ae7605	2021-09-21 10:34:00 +0200	[diff] [blame]	690	#' \dontrun{
Marc Kupietz	ecc8670	2025-06-24 12:12:51 +0200	[diff] [blame]	691	#' # Fetch all metadata of every query hit for "Ameisenplage" and show a summary
				692	#' q <- KorAPConnection() \|>
				693	#' corpusQuery("Ameisenplage") \|>
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	694	#' fetchAll()
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	695	#' q@collectedMatches
Marc Kupietz	ecc8670	2025-06-24 12:12:51 +0200	[diff] [blame]	696	#'
				697	#' # Fetch also all KWICs
				698	#' q <- KorAPConnection() \|> auth() \|>
				699	#' corpusQuery("Ameisenplage", metadataOnly = FALSE) \|>
				700	#' fetchAll()
				701	#' q@collectedMatches
				702	#'
				703	#' # Retrieve title and text sigle metadata of all texts published on 1958-03-12
				704	#' q <- KorAPConnection() \|>
				705	#' corpusQuery("<base/s=t>", # this matches each text once
				706	#' vc = "pubDate in 1958-03-12",
				707	#' fields = c("textSigle", "title"),
				708	#' ) \|>
				709	#' fetchAll()
				710	#' q@collectedMatches
Marc Kupietz	05b2277	2020-02-18 21:58:42 +0100	[diff] [blame]	711	#' }
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	712	#'
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	713	#' @aliases fetchAll
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	714	#' @export
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	715	setMethod("fetchAll", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) {
				716	return(fetchNext(kqo, offset = 0, maxFetch = NA, verbose = verbose, ...))
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	717	})
				718
				719	#' Fetches the remaining results of a KorAP query.
				720	#'
Marc Kupietz	dc880ac	2025-06-24 20:34:43 +0200	[diff] [blame]	721	#' @param kqo object obtained from [corpusQuery()]
				722	#' @param verbose print progress information if true
				723	#' @param ... further arguments passed to [fetchNext()]
				724	#' @return The updated `kqo` object with remaining results in `@collectedMatches`
				725	#'
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	726	#' @examples
Marc Kupietz	6ae7605	2021-09-21 10:34:00 +0200	[diff] [blame]	727	#' \dontrun{
				728	#'
Marc Kupietz	d352642	2025-06-25 09:16:15 +0200	[diff] [blame]	729	#' q <- KorAPConnection() \|>
				730	#' corpusQuery("Ameisenplage") \|>
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	731	#' fetchRest()
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	732	#' q@collectedMatches
Marc Kupietz	05b2277	2020-02-18 21:58:42 +0100	[diff] [blame]	733	#' }
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	734	#'
				735	#' @aliases fetchRest
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	736	#' @export
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	737	setMethod("fetchRest", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) {
				738	return(fetchNext(kqo, maxFetch = NA, verbose = verbose, ...))
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	739	})
				740
#' Parse XML annotations into linguistic layers
#'
#' Internal helper function to extract linguistic annotations (lemma, POS,
#' morphology) from XML annotation snippets returned by the KorAP API.
#'
#' If the snippet contains a `<span class="match">` element, only the text
#' following its opening tag is parsed, cut off at the first
#' `<span class="context-right">` (or, when there is none, with one trailing
#' `</span>` removed). Otherwise the whole snippet is parsed.
#'
#' Annotations are read from `title="..."` attributes. Each title is split on
#' whitespace and every piece of the form `prefix/l:value`, `prefix/p:value`
#' or `prefix/m:value` sets the lemma, part-of-speech or morphology value of
#' the token. When a token carries several values for one layer, the last one
#' wins.
#'
#' @param xml_snippet XML string containing annotation data
#' @return Named list with the character vectors `token`, `lemma`, `pos` and
#'   `morph`, all of the same length; annotations that are absent are `NA`
#' @keywords internal
parse_xml_annotations <- function(xml_snippet) {
  if (is.null(xml_snippet) || length(xml_snippet) == 0 ||
    is.na(xml_snippet) || xml_snippet == "") {
    return(list(
      token = character(0),
      lemma = character(0),
      pos = character(0),
      morph = character(0)
    ))
  }

  title_pattern <- 'title="([^"]*)"'

  # Map the whitespace-separated annotations found in title attributes
  # (e.g. "tt/l:Hund tt/p:NN") onto the lemma/pos/morph layers.
  read_layers <- function(title_contents) {
    layers <- c(
      lemma = NA_character_,
      pos = NA_character_,
      morph = NA_character_
    )
    for (annotation in unlist(strsplit(title_contents, "\\s+"))) {
      if (grepl("^[^/]+/l:", annotation)) {
        layers[["lemma"]] <- sub("^[^/]+/l:(.*)$", "\\1", annotation)
      } else if (grepl("^[^/]+/p:", annotation)) {
        layers[["pos"]] <- sub("^[^/]+/p:(.*)$", "\\1", annotation)
      } else if (grepl("^[^/]+/m:", annotation)) {
        layers[["morph"]] <- sub("^[^/]+/m:(.*)$", "\\1", annotation)
      }
    }
    layers
  }

  # Narrow the snippet down to the content of the match span, if there is one.
  content_to_parse <- xml_snippet
  match_open <- regexpr('<span class="match">', xml_snippet, fixed = TRUE)
  if (match_open > 0) {
    remaining <- substr(
      xml_snippet,
      match_open + attr(match_open, "match.length"),
      nchar(xml_snippet)
    )
    right_open <- regexpr('<span class="context-right">', remaining, fixed = TRUE)
    content_to_parse <- if (right_open > 0) {
      # Everything up to (not including) the right-context span
      substr(remaining, 1, right_open - 1)
    } else {
      # No right context: drop the trailing closing tag of the match span
      sub("</span>\\s*$", "", remaining)
    }
  }

  # Every chunk that ends right before a </span> is a token candidate: the
  # token text is whatever follows the last ">" of the chunk.
  parts <- trimws(unlist(strsplit(content_to_parse, "</span>", fixed = TRUE)))
  n_parts <- length(parts)
  tokens <- rep(NA_character_, n_parts)
  lemmas <- rep(NA_character_, n_parts)
  pos_tags <- rep(NA_character_, n_parts)
  morph_tags <- rep(NA_character_, n_parts)
  keep <- logical(n_parts)

  for (i in seq_len(n_parts)) {
    part <- parts[[i]]
    if (!nzchar(part) || !grepl("<span[^>]*title=", part)) next

    text_content <- trimws(sub(".*>([^<]*)$", "\\1", part))
    # An unchanged chunk (no trailing text found) still starts with "<"
    if (!nzchar(text_content) || startsWith(text_content, "<")) next

    titles <- regmatches(part, gregexpr(title_pattern, part))[[1]]
    layers <- read_layers(sub(title_pattern, "\\1", titles))

    tokens[i] <- text_content
    lemmas[i] <- layers[["lemma"]]
    pos_tags[i] <- layers[["pos"]]
    morph_tags[i] <- layers[["morph"]]
    keep[i] <- TRUE
  }

  tokens <- tokens[keep]
  lemmas <- lemmas[keep]
  pos_tags <- pos_tags[keep]
  morph_tags <- morph_tags[keep]

  # If no tokens were found with the splitting approach, fall back to
  # matching the innermost annotated spans that directly contain text.
  if (length(tokens) == 0) {
    innermost_pattern <- '<span[^>]*title="([^"]*)"[^>]*>([^<]+)</span>'
    hits <- regmatches(
      content_to_parse,
      gregexpr(innermost_pattern, content_to_parse, perl = TRUE)
    )[[1]]

    texts <- trimws(sub(innermost_pattern, "\\2", hits, perl = TRUE))
    titles <- sub(innermost_pattern, "\\1", hits, perl = TRUE)
    has_text <- nzchar(texts)
    layer_rows <- lapply(titles[has_text], read_layers)

    tokens <- texts[has_text]
    lemmas <- vapply(
      layer_rows, function(layers) layers[["lemma"]], character(1)
    )
    pos_tags <- vapply(
      layer_rows, function(layers) layers[["pos"]], character(1)
    )
    morph_tags <- vapply(
      layer_rows, function(layers) layers[["morph"]], character(1)
    )
  }

  list(
    token = tokens,
    lemma = lemmas,
    pos = pos_tags,
    morph = morph_tags
  )
}
				892
				893	#'
				894	#' Parse XML annotations into linguistic layers with left/match/right structure
				895	#'
				896	#' Internal helper function to extract linguistic annotations (lemma, POS, morphology)
				897	#' from XML annotation snippets returned by the KorAP API, split into left context,
				898	#' match, and right context sections like the tokens field.
				899	#'
				900	#' @param xml_snippet XML string containing annotation data
				901	#' @return Named list with nested structure containing left/match/right for 'atokens', 'lemma', 'pos', and 'morph'
				902	#' @keywords internal
				903	parse_xml_annotations_structured <- function(xml_snippet) {
				904	if (is.null(xml_snippet) \|\| is.na(xml_snippet) \|\| xml_snippet == "") {
				905	empty_result <- list(left = character(0), match = character(0), right = character(0))
				906	return(list(
				907	atokens = empty_result,
				908	lemma = empty_result,
				909	pos = empty_result,
				910	morph = empty_result
				911	))
				912	}
				913
				914	# Helper function to extract annotations from a span section
				915	extract_annotations_from_section <- function(section_content) {
Marc Kupietz	7ff770e	2025-07-18 19:07:10 +0200	[diff] [blame]	916	# Handle both spaced tokens and nested single tokens
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	917	tokens <- character(0)
				918	lemmas <- character(0)
				919	pos_tags <- character(0)
				920	morph_tags <- character(0)
Marc Kupietz	89f796e	2025-07-19 09:05:06 +0200	[diff] [blame^]	921
Marc Kupietz	7ff770e	2025-07-18 19:07:10 +0200	[diff] [blame]	922	# First try to split by spaces between span groups (for multiple tokens)
				923	# Look for spaces that separate token groups
				924	if (grepl('</span>\\s+<span', section_content)) {
				925	# Multiple tokens separated by spaces
				926	token_groups <- unlist(strsplit(section_content, '(?<=</span>)\\s+(?=<span)', perl = TRUE))
				927	} else {
				928	# Single token (or no spaces between tokens)
				929	token_groups <- c(section_content)
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	930	}
Marc Kupietz	89f796e	2025-07-19 09:05:06 +0200	[diff] [blame^]	931
Marc Kupietz	7ff770e	2025-07-18 19:07:10 +0200	[diff] [blame]	932	for (group in token_groups) {
				933	group <- trimws(group)
				934	if (nchar(group) == 0) next
Marc Kupietz	89f796e	2025-07-19 09:05:06 +0200	[diff] [blame^]	935
Marc Kupietz	7ff770e	2025-07-18 19:07:10 +0200	[diff] [blame]	936	# Extract the actual text content (the innermost text)
				937	text_match <- regexpr('>([^<>]+)</span>', group, perl = TRUE)
				938	if (text_match > 0) {
				939	# Find all possible text contents and take the last one (innermost)
				940	all_texts <- regmatches(group, gregexpr('>([^<>]+)</span>', group, perl = TRUE))[[1]]
				941	if (length(all_texts) > 0) {
				942	# Take the last match (innermost text)
				943	text_content <- sub('.>([^<>]+)</span>.', '\\1', all_texts[length(all_texts)], perl = TRUE)
				944	text_content <- trimws(text_content)
Marc Kupietz	89f796e	2025-07-19 09:05:06 +0200	[diff] [blame^]	945
Marc Kupietz	7ff770e	2025-07-18 19:07:10 +0200	[diff] [blame]	946	if (nchar(text_content) > 0 && !grepl('^<', text_content)) {
				947	tokens <- c(tokens, text_content)
Marc Kupietz	89f796e	2025-07-19 09:05:06 +0200	[diff] [blame^]	948
Marc Kupietz	7ff770e	2025-07-18 19:07:10 +0200	[diff] [blame]	949	# Extract all title attributes from this group
				950	titles <- regmatches(group, gregexpr('title="([^"]*)"', group, perl = TRUE))[[1]]
Marc Kupietz	89f796e	2025-07-19 09:05:06 +0200	[diff] [blame^]	951
Marc Kupietz	7ff770e	2025-07-18 19:07:10 +0200	[diff] [blame]	952	morph_features <- character(0)
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	953	lemma <- NA
				954	pos_tag <- NA
Marc Kupietz	89f796e	2025-07-19 09:05:06 +0200	[diff] [blame^]	955
Marc Kupietz	7ff770e	2025-07-18 19:07:10 +0200	[diff] [blame]	956	for (title in titles) {
				957	content <- sub('title="([^"]*)"', '\\1', title, perl = TRUE)
Marc Kupietz	89f796e	2025-07-19 09:05:06 +0200	[diff] [blame^]	958
Marc Kupietz	7ff770e	2025-07-18 19:07:10 +0200	[diff] [blame]	959	if (grepl('^[^/]+/l:', content)) {
				960	lemma <- sub('^[^/]+/l:(.*)$', '\\1', content)
				961	} else if (grepl('^[^/]+/p:', content)) {
				962	pos_tag <- sub('^[^/]+/p:(.*)$', '\\1', content)
				963	} else if (grepl('^[^/]+/m:', content)) {
				964	morph_feature <- sub('^[^/]+/m:(.*)$', '\\1', content)
				965	morph_features <- c(morph_features, morph_feature)
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	966	}
				967	}
Marc Kupietz	89f796e	2025-07-19 09:05:06 +0200	[diff] [blame^]	968
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	969	lemmas <- c(lemmas, lemma)
				970	pos_tags <- c(pos_tags, pos_tag)
Marc Kupietz	7ff770e	2025-07-18 19:07:10 +0200	[diff] [blame]	971	morph_tag <- if (length(morph_features) > 0) paste(morph_features, collapse = "\|") else NA
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	972	morph_tags <- c(morph_tags, morph_tag)
				973	}
				974	}
				975	}
				976	}
				977
				978	# Ensure all vectors have the same length
				979	max_length <- max(length(tokens), length(lemmas), length(pos_tags), length(morph_tags))
				980	if (max_length > 0) {
				981	tokens <- c(tokens, rep(NA, max_length - length(tokens)))
				982	lemmas <- c(lemmas, rep(NA, max_length - length(lemmas)))
				983	pos_tags <- c(pos_tags, rep(NA, max_length - length(pos_tags)))
				984	morph_tags <- c(morph_tags, rep(NA, max_length - length(morph_tags)))
				985	}
				986
				987	return(list(
				988	tokens = tokens,
				989	lemmas = lemmas,
				990	pos_tags = pos_tags,
				991	morph_tags = morph_tags
				992	))
				993	}
				994
				995	# Split the XML into three parts: left context, match content, and right context
				996	# The structure is: <span class="match">...left...<mark>...match...</mark>...right...</span>
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	997
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	998	# First extract the content within the match span using DOTALL modifier
				999	match_span_pattern <- '(?s)<span class="match">(.?)</span>\\s<span class="context-right">'
				1000	match_span_match <- regexpr(match_span_pattern, xml_snippet, perl = TRUE)
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	1001
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1002	if (match_span_match == -1) {
				1003	# Try alternative pattern if no context-right
				1004	match_span_pattern <- '(?s)<span class="match">(.?)</span>\\s$'
				1005	match_span_match <- regexpr(match_span_pattern, xml_snippet, perl = TRUE)
				1006	}
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	1007
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1008	if (match_span_match > 0) {
				1009	match_span_content <- gsub(match_span_pattern, '\\1', xml_snippet, perl = TRUE)
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	1010
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1011	# Now find the <mark> and </mark> positions within this content
				1012	mark_start <- regexpr('<mark[^>]*>', match_span_content, perl = TRUE)
				1013	mark_end <- regexpr('</mark>', match_span_content, perl = TRUE)
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	1014
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1015	if (mark_start > 0 && mark_end > 0) {
				1016	# Left context: everything before <mark>
				1017	left_content <- substr(match_span_content, 1, mark_start - 1)
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	1018
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1019	# Match content: everything between <mark> and </mark> (including the mark tags for now)
				1020	match_content <- substr(match_span_content, mark_start, mark_end + attr(mark_end, "match.length") - 1)
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	1021
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1022	# Right context: everything after </mark>
				1023	right_content_start <- mark_end + attr(mark_end, "match.length")
				1024	right_content <- substr(match_span_content, right_content_start, nchar(match_span_content))
				1025	} else {
				1026	# No mark tags found, treat entire match span as match content
				1027	left_content <- ""
				1028	match_content <- match_span_content
				1029	right_content <- ""
				1030	}
				1031	} else {
				1032	# No match span found, treat entire content as match
				1033	left_content <- ""
				1034	match_content <- xml_snippet
				1035	right_content <- ""
				1036	}
				1037
				1038	# Process each section
				1039	left_annotations <- extract_annotations_from_section(left_content)
				1040	match_annotations <- extract_annotations_from_section(match_content)
				1041	right_annotations <- extract_annotations_from_section(right_content)
				1042
				1043	return(list(
				1044	atokens = list(
				1045	left = left_annotations$tokens,
				1046	match = match_annotations$tokens,
				1047	right = right_annotations$tokens
				1048	),
				1049	lemma = list(
				1050	left = left_annotations$lemmas,
				1051	match = match_annotations$lemmas,
				1052	right = right_annotations$lemmas
				1053	),
				1054	pos = list(
				1055	left = left_annotations$pos_tags,
				1056	match = match_annotations$pos_tags,
				1057	right = right_annotations$pos_tags
				1058	),
				1059	morph = list(
				1060	left = left_annotations$morph_tags,
				1061	match = match_annotations$morph_tags,
				1062	right = right_annotations$morph_tags
				1063	)
				1064	))
				1065	}
				1066
#' Fetch annotations for all collected matches
#'
#' `r lifecycle::badge("experimental")`
#'
#' `fetchAnnotations` fetches annotations (only token annotations, for now)
#' for all matches in the `@collectedMatches` slot
#' of a KorAPQuery object and adds annotation columns directly to the `@collectedMatches`
#' data frame. The method uses the `matchID` from collected matches and falls back
#' to `textSigle`, `matchStart` and `matchEnd` if no usable `matchID` is available.
#'
#' Important: For copyright-restricted corpora, users must be authorized via [auth()]
#' and the initial corpus query must have `metadataOnly = FALSE` to ensure snippets are
#' available for annotation parsing.
#'
#' The method parses XML snippet annotations and adds linguistic columns to the data frame:
#' - `pos`: data frame with `left`, `match`, `right` columns, each containing list vectors of part-of-speech tags
#' - `lemma`: data frame with `left`, `match`, `right` columns, each containing list vectors of lemmas
#' - `morph`: data frame with `left`, `match`, `right` columns, each containing list vectors of morphological tags
#' - `atokens`: data frame with `left`, `match`, `right` columns, each containing list vectors of token text (from annotations)
#' - `annotation_snippet`: original XML snippet from the annotation API
#'
#' @family corpus search functions
#' @concept Annotations
#' @aliases fetchAnnotations
#'
#' @param kqo object obtained from [corpusQuery()] with collected matches. Note: the original corpus query should have `metadataOnly = FALSE` for annotation parsing to work.
#' @param foundry string specifying the foundry to use for annotations (default: "tt" for Tree-Tagger)
#' @param verbose print progress information if true
#' @return The updated `kqo` object with annotation columns
#' like `pos`, `lemma`, `morph` (and `atokens` and `annotation_snippet`)
#' in the `@collectedMatches` slot. Each column is a data frame
#' with `left`, `match`, and `right` columns containing list vectors of annotations
#' for the left context, matched tokens, and right context, respectively.
#' The original XML snippet for each match is also stored in `annotation_snippet`.
#' Matches whose annotations could not be fetched or parsed get `NA` entries;
#' matches whose response contains no snippet get empty character vectors.
#'
#' @examples
#' \dontrun{
#'
#' # Fetch annotations for matches using Tree-Tagger foundry
#' # Note: Authorization required for copyright-restricted corpora
#' q <- KorAPConnection() |>
#'   auth() |>
#'   corpusQuery("Ameisenplage", metadataOnly = FALSE) |>
#'   fetchNext(maxFetch = 10) |>
#'   fetchAnnotations()
#'
#' # Access linguistic annotations for match i:
#' pos_tags <- q@collectedMatches$pos # Data frame with left/match/right columns for POS tags
#' lemmas <- q@collectedMatches$lemma # Data frame with left/match/right columns for lemmas
#' morphology <- q@collectedMatches$morph # Data frame with left/match/right columns for morphological tags
#' atokens <- q@collectedMatches$atokens # Data frame with left/match/right columns for annotation token text
#' raw_snippet <- q@collectedMatches$annotation_snippet[[i]] # Original XML snippet for match i
#'
#' # Access specific components:
#' match_pos <- q@collectedMatches$pos$match[[i]] # POS tags for the matched tokens in match i
#' left_lemmas <- q@collectedMatches$lemma$left[[i]] # Lemmas for the left context in match i
#' right_tokens <- q@collectedMatches$atokens$right[[i]] # Token text for the right context in match i
#'
#' # Use a different foundry (e.g., MarMoT)
#' q <- KorAPConnection() |>
#'   auth() |>
#'   corpusQuery("Ameisenplage", metadataOnly = FALSE) |>
#'   fetchNext(maxFetch = 10) |>
#'   fetchAnnotations(foundry = "marmot")
#' q@collectedMatches$pos$left[1] # POS tags for the left context of the first match
#' }
#' @export
setMethod("fetchAnnotations", "KorAPQuery", function(kqo, foundry = "tt", verbose = kqo@korapConnection@verbose) {
  if (is.null(kqo@collectedMatches) || nrow(kqo@collectedMatches) == 0) {
    warning("No collected matches found. Please run fetchNext() or fetchAll() first.")
    return(kqo)
  }

  df <- kqo@collectedMatches
  kco <- kqo@korapConnection
  nrows <- nrow(df)

  # Annotation layers (nested data frame columns of `df`) and the context
  # sections each of them is split into.
  annotation_layers <- c("pos", "lemma", "morph", "atokens")
  context_sections <- c("left", "match", "right")

  # One nested left/match/right data frame whose cells are empty character
  # vectors; I() keeps the list columns from being expanded by data.frame().
  empty_layer <- function() {
    data.frame(
      left = I(replicate(nrows, character(0), simplify = FALSE)),
      match = I(replicate(nrows, character(0), simplify = FALSE)),
      right = I(replicate(nrows, character(0), simplify = FALSE)),
      stringsAsFactors = FALSE
    )
  }

  # Annotation structure (same shape as the result of
  # parse_xml_annotations_structured()) with `value` in every cell.
  uniform_annotations <- function(value) {
    sections <- list(left = value, match = value, right = value)
    list(pos = sections, lemma = sections, morph = sections, atokens = sections)
  }

  # Write the annotations of one match into row `i` of `target` and return the
  # modified data frame. Wrapping in list() stores each vector as a single cell.
  store_annotations <- function(target, i, annotations) {
    for (layer in annotation_layers) {
      for (section in context_sections) {
        target[[layer]][[section]][i] <- list(annotations[[layer]][[section]])
      }
    }
    target
  }

  # Build the annotation request URL for row `i`.
  annotation_url <- function(i) {
    # matchID format: "match-match-A00/JUN/39609-p202-203" or encrypted format like
    # "match-DNB10/CSL/80400-p2343-2344x_MinDOhu_P6dd2MMZJyyus_7MairdKnr1LxY07Cya-Ow"
    has_match_id <- "matchID" %in% colnames(df) && !is.na(df$matchID[i])
    if (has_match_id && grepl("match-(.+?-p\\d+-\\d+.*)", df$matchID[i], perl = TRUE)) {
      # Everything after the leading "match-", including an encrypted suffix
      doc_path_with_pos_and_encryption <- gsub("^match-(.+)$", "\\1", df$matchID[i], perl = TRUE)
      # Convert the dash before the position to a slash, keep everything after it
      match_path <- gsub("-p(\\d+-\\d+.*)", "/p\\1", doc_path_with_pos_and_encryption)
    } else {
      # No usable matchID: construct the path from text sigle and match span.
      # Format numbers to avoid scientific notation.
      match_start <- format(df$matchStart[i], scientific = FALSE)
      match_end <- format(df$matchEnd[i], scientific = FALSE)
      match_path <- paste0(df$textSigle[i], "/", "p", match_start, "-", match_end)
    }
    httr2::url_modify(paste0(kco@apiUrl, "corpus/", match_path), query = list(foundry = foundry))
  }

  # Initialize annotation columns as nested data frames (like the tokens field)
  df$pos <- empty_layer()
  df$lemma <- empty_layer()
  df$morph <- empty_layer()
  df$atokens <- empty_layer()
  df$annotation_snippet <- replicate(nrows, NA, simplify = FALSE)

  # Initialize timing for ETA calculation
  start_time <- Sys.time()
  if (verbose) {
    log_info(verbose, paste("Starting to fetch annotations for", nrows, "matches\n"))
  }

  for (i in seq_len(nrows)) {
    # ETA logging
    if (verbose && i > 1) {
      eta_info <- calculate_eta(i, nrows, start_time)
      log_info(verbose, paste("Fetching annotations for match", i, "of", nrows, eta_info, "\n"))
    }

    req <- annotation_url(i)

    # Fetch and parse. The handler *returns* the failure outcome instead of
    # assigning to `df`: an assignment with `<-` inside a handler function
    # would only modify a handler-local copy and be lost.
    outcome <- tryCatch(
      {
        res <- apiCall(kco, req)
        if (is.null(res)) {
          # Failed request
          list(snippet = NA, annotations = uniform_annotations(NA))
        } else if (is.list(res) && "snippet" %in% names(res)) {
          list(
            snippet = res$snippet,
            annotations = parse_xml_annotations_structured(res$snippet)
          )
        } else {
          # Response without snippet: nothing to parse
          list(snippet = NA, annotations = uniform_annotations(character(0)))
        }
      },
      error = function(e) {
        if (verbose) {
          log_info(verbose, paste("Failed to fetch annotations for match", i, ":", conditionMessage(e), "\n"))
        }
        list(snippet = NA, annotations = uniform_annotations(NA))
      }
    )

    # `[i] <- list(...)` rather than `[[i]] <-` so that a NULL snippet cannot
    # delete the list element and shift all following rows.
    df$annotation_snippet[i] <- list(outcome$snippet)

    # Best effort: if the parsed structure cannot be stored, keep the row with
    # empty annotation vectors instead of aborting the whole run.
    df <- tryCatch(
      store_annotations(df, i, outcome$annotations),
      error = function(assign_error) {
        store_annotations(df, i, uniform_annotations(character(0)))
      }
    )
  }

  # Update the collectedMatches with annotation data
  kqo@collectedMatches <- df

  if (verbose) {
    elapsed_time <- Sys.time() - start_time
    log_info(verbose, paste("Finished fetching annotations for", nrows, "matches in", format_duration(as.numeric(elapsed_time, units = "secs")), "\n"))
  }

  return(kqo)
})
				1360
#' Query frequencies of search expressions in virtual corpora
#'
#' `frequencyQuery` combines [corpusQuery()], [corpusStats()] and
#' [ci()] to compute a tibble with the absolute and relative frequencies and
#' confidence intervals of one or multiple search terms across one or multiple
#' virtual corpora.
#'
#' @family frequency analysis
#' @aliases frequencyQuery
#' @examples
#' \dontrun{
#'
#' KorAPConnection(verbose = TRUE) |>
#'   frequencyQuery(c("Mücke", "Schnake"), paste0("pubDate in ", 2000:2003))
#' }
#'
# @inheritParams corpusQuery
#' @param kco [KorAPConnection()] object (obtained e.g. from `KorAPConnection()`)
#' @param query corpus query string(s) (can be a vector). The query language depends on the `ql` parameter. Either `query` must be provided or `KorAPUrl`.
#' @param vc virtual corpus definition(s) (can be a vector)
#' @param conf.level confidence level of the returned confidence interval (passed through [ci()] to [prop.test()]).
#' @param as.alternatives LOGICAL that specifies if the query terms should be treated as alternatives. If `as.alternatives` is TRUE, the sum over all query hits, instead of the respective vc token sizes is used as total for the calculation of relative frequencies.
#' @param ... further arguments passed to or from other methods (see [corpusQuery()]), most notably `expand`, a logical that decides if `query` and `vc` parameters are expanded to all of their combinations. It defaults to `TRUE`, if `query` and `vc` have different lengths, and to `FALSE` otherwise.
#' @export
#'
#' @return A tibble, with each row containing the following result columns for query and vc combinations:
#'   - query: the query string used for the frequency analysis.
#'   - totalResults: absolute frequency of query matches in the vc.
#'   - vc: virtual corpus used for the query.
#'   - webUIRequestUrl: URL of the corresponding web UI request with respect to query and vc.
#'   - total: total number of words in vc.
#'   - f: relative frequency of query matches in the vc.
#'   - conf.low: lower bound of the confidence interval for the relative frequency, given `conf.level`.
#'   - conf.high: upper bound of the confidence interval for the relative frequency, given `conf.level`.

setMethod(
  "frequencyQuery", "KorAPConnection",
  function(kco, query, vc = "", conf.level = 0.95, as.alternatives = FALSE, ...) {
    # Both variants start from the same metadata-only query result.
    hits <- corpusQuery(kco, query, vc, metadataOnly = TRUE, as.df = TRUE, ...)

    # Inside group_by()/mutate() `vc` and `totalResults` refer to the columns
    # of `hits` (data masking), not to the function arguments.
    with_totals <- if (as.alternatives) {
      # Alternatives: the reference total is the sum of all hits per vc.
      mutate(group_by(hits, vc), total = sum(totalResults))
    } else {
      # Otherwise: the reference total is the token count of the vc itself.
      mutate(hits, total = corpusStats(kco, vc = vc, as.df = TRUE)$tokens)
    }

    ci(with_totals, conf.level = conf.level)
  }
)
Marc Kupietz	3f57528	2019-10-04 14:46:04 +0200	[diff] [blame]	1410
#' buildWebUIRequestUrlFromString
#'
#' @rdname KorAPQuery-class
#' @importFrom urltools url_encode
#' @export
buildWebUIRequestUrlFromString <- function(KorAPUrl,
                                           query,
                                           vc = "",
                                           ql = "poliqarp") {
  # Builds the web UI URL "<KorAPUrl>?q=<query>[&cq=<vc>]&ql=<ql>".
  #   KorAPUrl: base URL string, or a KorAPConnection whose @KorAPUrl is used
  #   query:    query string(s); URL-encoded as UTF-8
  #   vc:       virtual corpus definition(s); "" omits the cq parameter
  #   ql:       query language, appended verbatim (not URL-encoded)

  # inherits() follows S4 inheritance, so subclasses of KorAPConnection are
  # unwrapped too; `"KorAPConnection" %in% class(x)` only matched the exact class.
  if (inherits(KorAPUrl, "KorAPConnection")) {
    KorAPUrl <- KorAPUrl@KorAPUrl
  }

  # ifelse() (not if/else) is deliberate: it is evaluated element-wise, so
  # vector-valued `vc` yields one cq fragment per element.
  cq_part <- ifelse(vc != "",
    paste0("&cq=", urltools::url_encode(enc2utf8(vc))),
    ""
  )

  request <-
    paste0(
      "?q=",
      urltools::url_encode(enc2utf8(as.character(query))),
      cq_part,
      "&ql=",
      ql
    )
  paste0(KorAPUrl, request)
}
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1437
				1438	#' buildWebUIRequestUrl
				1439	#'
				1440	#' @rdname KorAPQuery-class
Marc Kupietz	f912959	2025-01-26 19:17:54 +0100	[diff] [blame]	1441	#' @importFrom httr2 url_parse
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1442	#' @export
				1443	buildWebUIRequestUrl <- function(kco,
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	1444	query = if (missing(KorAPUrl)) {
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1445	stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	1446	} else {
				1447	httr2::url_parse(KorAPUrl)$query$q
				1448	},
Marc Kupietz	f912959	2025-01-26 19:17:54 +0100	[diff] [blame]	1449	vc = if (missing(KorAPUrl)) "" else httr2::url_parse(KorAPUrl)$query$cq,
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1450	KorAPUrl,
Marc Kupietz	f912959	2025-01-26 19:17:54 +0100	[diff] [blame]	1451	ql = if (missing(KorAPUrl)) "poliqarp" else httr2::url_parse(KorAPUrl)$query$ql) {
Marc Kupietz	38a9d68	2024-12-06 16:17:09 +0100	[diff] [blame]	1452	buildWebUIRequestUrlFromString(kco@KorAPUrl, query, vc, ql)
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1453	}
				1454
#' format()
#' @rdname KorAPQuery-class
#' @param x KorAPQuery object
#' @param ... further arguments passed to or from other methods
#' @importFrom urltools param_get url_decode
#' @export
format.KorAPQuery <- function(x, ...) {
  # Note: this method writes its summary with cat() rather than returning a
  # formatted string.
  cat("<KorAPQuery>\n")
  q <- x
  # Decode the query parameters of the stored request for display
  param <- urltools::param_get(q@request) |> lapply(urltools::url_decode)
  cat(" Query: ", param$q, "\n")
  if (!is.null(param$cq) && param$cq != "") {
    cat(" Virtual corpus: ", param$cq, "\n")
  }
  if (!is.null(q@collectedMatches)) {
    cat("==============================================================================================================", "\n")
    print(summary(q@collectedMatches))
    cat("==============================================================================================================", "\n")
  }
  cat(" Total results: ", q@totalResults, "\n")
  cat(" Fetched results: ", q@nextStartIndex, "\n")
  if (!is.null(q@collectedMatches) && "pos" %in% colnames(q@collectedMatches)) {
    successful_annotations <- sum(!is.na(q@collectedMatches$annotation_snippet))

    # Count *matches* with parsed data: a match counts once if any of its POS
    # cells is non-empty and not entirely NA. Applying is.na() to the nested
    # data frame would count cells (up to three per match) and would treat
    # empty character(0) cells as parsed.
    pos <- q@collectedMatches$pos
    cell_has_data <- function(cell) length(cell) > 0 && !all(is.na(cell))
    parsed_annotations <- if (is.data.frame(pos)) {
      per_section <- lapply(pos, function(column) vapply(column, cell_has_data, logical(1)))
      sum(Reduce(`|`, per_section, rep(FALSE, nrow(pos))))
    } else {
      sum(!is.na(pos))
    }

    cat(" Annotations: ", successful_annotations, " of ", nrow(q@collectedMatches), " matches")
    if (parsed_annotations > 0) {
      cat(" (", parsed_annotations, " with parsed linguistic data)")
    }
    cat("\n")
  }
}
				1486
#' show()
#'
#' @rdname KorAPQuery-class
#' @param object KorAPQuery object
#' @export
setMethod("show", "KorAPQuery", function(object) {
  # format() prints the summary itself; its return value is not used here.
  format(object)
  # Return the object invisibly so auto-printing does not echo it again.
  invisible(object)
})