Blame - R/KorAPQuery.R - KorAP/RKorAPClient

blob: 1afb122966953aa9707d5c67b83bcc786dd2c7ac [file] [log] [blame]

Marc Kupietz	a8c40f4	2025-06-24 15:49:52 +0200	[diff] [blame]	1	#' KorAPQuery class (internal)
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	2	#'
Marc Kupietz	a8c40f4	2025-06-24 15:49:52 +0200	[diff] [blame]	3	#' Internal class for query state management. Users work with `corpusQuery()`, `fetchAll()`, and `fetchNext()` instead.
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	4	#'
Marc Kupietz	a8c40f4	2025-06-24 15:49:52 +0200	[diff] [blame]	5	#' @keywords internal
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	6	#' @include KorAPConnection.R
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	7	#' @include logging.R
Marc Kupietz	f912959	2025-01-26 19:17:54 +0100	[diff] [blame]	8	#' @import httr2
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	9	#'
Marc Kupietz	a6e4ee6	2021-03-05 09:00:15 +0100	[diff] [blame]	10	#' @include RKorAPClient-package.R
Marc Kupietz	5bbc9db	2019-08-30 16:30:45 +0200	[diff] [blame]	11
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	12	#' @export
				13	KorAPQuery <- setClass("KorAPQuery", slots = c(
Marc Kupietz	b897218	2019-09-20 21:33:46 +0200	[diff] [blame]	14	"korapConnection",
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	15	"request",
				16	"vc",
				17	"totalResults",
				18	"nextStartIndex",
				19	"fields",
				20	"requestUrl",
				21	"webUIRequestUrl",
				22	"apiResponse",
				23	"collectedMatches",
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	24	"hasMoreMatches"
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	25	))
Marc Kupietz	5bbc9db	2019-08-30 16:30:45 +0200	[diff] [blame]	26
Marc Kupietz	a8c40f4	2025-06-24 15:49:52 +0200	[diff] [blame]	27	#' Initialize KorAPQuery object
				28	#' @keywords internal
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	29	#' @param .Object …
Marc Kupietz	b897218	2019-09-20 21:33:46 +0200	[diff] [blame]	30	#' @param korapConnection KorAPConnection object
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	31	#' @param request query part of the request URL
				32	#' @param vc definition of a virtual corpus
				33	#' @param totalResults number of hits the query has yielded
				34	#' @param nextStartIndex at what index to start the next fetch of query results
				35	#' @param fields what data / metadata fields should be collected
				36	#' @param requestUrl complete URL of the API request
				37	#' @param webUIRequestUrl URL of a web frontend request corresponding to the API request
				38	#' @param apiResponse data-frame representation of the JSON response of the API request
Marc Kupietz	7776dec	2019-09-27 16:59:02 +0200	[diff] [blame]	39	#' @param hasMoreMatches logical that signals if more query results can be fetched
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	40	#' @param collectedMatches matches already fetched from the KorAP-API-server
Marc Kupietz	97a1bca	2019-10-04 22:52:09 +0200	[diff] [blame]	41	#'
				42	#' @importFrom tibble tibble
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	43	#' @export
setMethod(
  "initialize", "KorAPQuery",
  function(.Object,
           korapConnection = NULL,
           request = NULL,
           vc = "",
           totalResults = 0,
           nextStartIndex = 0,
           fields = c(
             "corpusSigle", "textSigle", "pubDate", "pubPlace",
             "availability", "textClass", "snippet", "tokens"
           ),
           requestUrl = "",
           webUIRequestUrl = "",
           apiResponse = NULL,
           hasMoreMatches = FALSE,
           collectedMatches = NULL) {
    # Let the default initializer build the object first, then overwrite every
    # slot with the (possibly defaulted) argument of the same name.
    .Object <- callNextMethod()

    # list() keeps NULL entries, so slots whose argument is NULL are still
    # assigned explicitly, in the same order as the slot declaration.
    slot_values <- list(
      korapConnection = korapConnection,
      request = request,
      vc = vc,
      totalResults = totalResults,
      nextStartIndex = nextStartIndex,
      fields = fields,
      requestUrl = requestUrl,
      webUIRequestUrl = webUIRequestUrl,
      apiResponse = apiResponse,
      hasMoreMatches = hasMoreMatches,
      collectedMatches = collectedMatches
    )
    for (slot_name in names(slot_values)) {
      slot(.Object, slot_name) <- slot_values[[slot_name]]
    }

    .Object
  }
)
Marc Kupietz	632cbd4	2019-09-06 16:04:51 +0200	[diff] [blame]	66
# S4 generics operating on a KorAPConnection object (`kco`).
setGeneric(
  "corpusQuery",
  function(kco, ...) standardGeneric("corpusQuery")
)
setGeneric(
  "frequencyQuery",
  function(kco, ...) standardGeneric("frequencyQuery")
)

# S4 generics operating on a KorAPQuery object (`kqo`).
setGeneric(
  "fetchAll",
  function(kqo, ...) standardGeneric("fetchAll")
)
setGeneric(
  "fetchNext",
  function(kqo, ...) standardGeneric("fetchNext")
)
setGeneric(
  "fetchRest",
  function(kqo, ...) standardGeneric("fetchRest")
)
setGeneric(
  "fetchAnnotations",
  function(kqo,
           foundry = "tt",
           overwrite = FALSE,
           verbose = kqo@korapConnection@verbose) {
    standardGeneric("fetchAnnotations")
  }
)

# Page size used when fetching query results from the API.
maxResultsPerPage <- 50

# Silence R CMD check notes about the `.` placeholder used in pipelines.
utils::globalVariables(".")
Marc Kupietz	632cbd4	2019-09-06 16:04:51 +0200	[diff] [blame]	84
Marc Kupietz	a8c40f4	2025-06-24 15:49:52 +0200	[diff] [blame]	85	#' Search corpus for query terms
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	86	#'
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	87	#' `corpusQuery` performs a corpus query via a connection to a KorAP-API-server
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	88	#'
Marc Kupietz	a8c40f4	2025-06-24 15:49:52 +0200	[diff] [blame]	89	#' @family corpus search functions
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	90	#' @aliases corpusQuery
				91	#'
				92	#' @importFrom urltools url_encode
				93	#' @importFrom purrr pmap
Marc Kupietz	ea34b81	2025-06-25 15:49:00 +0200	[diff] [blame]	94	#' @importFrom dplyr bind_rows group_by
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	95	#'
Marc Kupietz	617266d	2025-02-27 10:43:07 +0100	[diff] [blame]	96	#' @param kco [KorAPConnection()] object (obtained e.g. from `KorAPConnection()`)
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	97	#' @param query string that contains the corpus query. The query language depends on the `ql` parameter. Either `query` must be provided or `KorAPUrl`.
Marc Kupietz	632cbd4	2019-09-06 16:04:51 +0200	[diff] [blame]	98	#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	99	#' @param KorAPUrl instead of providing the query and vc string parameters, you can also simply copy a KorAP query URL from your browser and use it here (and in `KorAPConnection`) to provide all necessary information for the query.
Marc Kupietz	132f005	2023-04-16 14:23:05 +0200	[diff] [blame]	100	#' @param metadataOnly logical that determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. Note that the default value is TRUE.
				101	#' If you want your corpus queries to return not only metadata, but also KWICS, you need to authorize
				102	#' your RKorAPClient application as explained in the
				103	#' [authorization section](https://github.com/KorAP/RKorAPClient#authorization)
				104	#' of the RKorAPClient Readme on GitHub and set the `metadataOnly` parameter to
				105	#' `FALSE`.
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	106	#' @param ql string to choose the query language (see [section on Query Parameters](https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET#user-content-parameters) in the Kustvakt-Wiki for possible values).
Marc Kupietz	1623fe8	2025-06-24 16:31:46 +0200	[diff] [blame]	107	#' @param fields character vector specifying which metadata fields to retrieve for each match.
				108	#' Available fields depend on the corpus. For DeReKo (German Reference Corpus), possible fields include:
				109	#' \describe{
				110	#' \item{Text identification:}{`textSigle`, `docSigle`, `corpusSigle` - hierarchical text identifiers}
				111	#' \item{Publication info:}{`author`, `editor`, `title`, `docTitle`, `corpusTitle` - authorship and titles}
				112	#' \item{Temporal data:}{`pubDate`, `creationDate` - when text was published/created}
				113	#' \item{Publication details:}{`pubPlace`, `publisher`, `reference` - where/how published}
				114	#' \item{Text classification:}{`textClass`, `textType`, `textTypeArt`, `textDomain`, `textColumn` - topic domain, genre, text type and column}
				115	#' \item{Administrative and technical info:}{`corpusEditor`, `availability`, `language`, `foundries` - access rights and annotations}
				116	#' \item{Content data:}{`snippet`, `tokens`, `tokenSource`, `externalLink` - actual text content, tokenization, and link to source text}
				117	#' \item{System data:}{`indexCreationDate`, `indexLastModified` - corpus indexing info}
				118	#' }
				119	#' Use `c("textSigle", "pubDate", "author")` to retrieve multiple fields.
				120	#' Default fields provide basic text identification and publication metadata. The actual text content (`snippet` and `tokens`) are activated by default if `metadataOnly` is set to `FALSE`.
Marc Kupietz	43a6ade	2020-02-18 17:01:44 +0100	[diff] [blame]	121	#' @param accessRewriteFatal abort if query or given vc had to be rewritten due to insufficient rights (not yet implemented).
Marc Kupietz	25aebc3	2019-09-16 18:40:50 +0200	[diff] [blame]	122	#' @param verbose print some info
Marc Kupietz	4de53ec	2019-10-04 09:12:00 +0200	[diff] [blame]	123	#' @param as.df return result as data frame instead of as S4 object?
Marc Kupietz	ad8d2ed	2025-04-05 15:37:38 +0200	[diff] [blame]	124	#' @param expand logical that decides if `query` and `vc` parameters are expanded to all of their combinations. Defaults to `TRUE`, iff `query` and `vc` have different lengths
Marc Kupietz	d9b2fd7	2023-04-17 19:08:50 +0200	[diff] [blame]	125	#' @param context string that specifies the size of the left and the right context returned in `snippet`
				126	#' (provided that `metadataOnly` is set to `FALSE` and that the necessary access rights are met).
				127	#' The format of the context size specification (e.g. `3-token,3-token`) is described in the [Service: Search GET documentation of the Kustvakt Wiki](https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET).
				128	#' If the parameter is not set, the default context size specification of the KorAP server instance will be used.
				129	#' Note that you cannot overrule the maximum context size set in the KorAP server instance,
				130	#' as this is typically legally motivated.
Marc Kupietz	ad8d2ed	2025-04-05 15:37:38 +0200	[diff] [blame]	131	#' @return Depending on the `as.df` parameter, a tibble or a [KorAPQuery()] object that, among other information, contains the total number of results in `@totalResults`. The resulting object can be used to fetch all query results (with [fetchAll()]) or the next page of results (with [fetchNext()]).
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	132	#' A corresponding URL to be used within a web browser is contained in `@webUIRequestUrl`.
				133	#' Please make sure to check `$collection$rewrites` to see if any unforeseen access rewrites of the query's virtual corpus had to be performed.
Marc Kupietz	632cbd4	2019-09-06 16:04:51 +0200	[diff] [blame]	134	#'
				135	#' @examples
Marc Kupietz	6ae7605	2021-09-21 10:34:00 +0200	[diff] [blame]	136	#' \dontrun{
				137	#'
Marc Kupietz	1623fe8	2025-06-24 16:31:46 +0200	[diff] [blame]	138	#' # Fetch basic metadata for "Ameisenplage"
Marc Kupietz	d352642	2025-06-25 09:16:15 +0200	[diff] [blame]	139	#' KorAPConnection() |>
				140	#' corpusQuery("Ameisenplage") |>
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	141	#' fetchAll()
Marc Kupietz	1623fe8	2025-06-24 16:31:46 +0200	[diff] [blame]	142	#'
				143	#' # Fetch specific metadata fields for bibliographic analysis
Marc Kupietz	d352642	2025-06-25 09:16:15 +0200	[diff] [blame]	144	#' query <- KorAPConnection() |>
Marc Kupietz	1623fe8	2025-06-24 16:31:46 +0200	[diff] [blame]	145	#' corpusQuery("Ameisenplage",
				146	#' fields = c("textSigle", "author", "title", "pubDate", "pubPlace", "textType"))
				147	#' results <- fetchAll(query)
				148	#' results@collectedMatches
Marc Kupietz	657d8e7	2020-02-25 18:31:50 +0100	[diff] [blame]	149	#' }
Marc Kupietz	3c531f6	2019-09-13 12:17:24 +0200	[diff] [blame]	150	#'
Marc Kupietz	6ae7605	2021-09-21 10:34:00 +0200	[diff] [blame]	151	#' \dontrun{
				152	#'
Marc Kupietz	603491f	2019-09-18 14:01:02 +0200	[diff] [blame]	153	#' # Use the copy of a KorAP-web-frontend URL for an API query of "Ameise" in a virtual corpus
				154	#' # and show the number of query hits (but don't fetch them).
Marc Kupietz	69cc54a	2019-09-30 12:06:54 +0200	[diff] [blame]	155	#'
Marc Kupietz	d352642	2025-06-25 09:16:15 +0200	[diff] [blame]	156	#' KorAPConnection(verbose = TRUE) |>
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	157	#' corpusQuery(
				158	#' KorAPUrl =
				159	#' "https://korap.ids-mannheim.de/?q=Ameise&cq=pubDate+since+2017&ql=poliqarp"
				160	#' )
Marc Kupietz	6ae7605	2021-09-21 10:34:00 +0200	[diff] [blame]	161	#' }
				162	#'
				163	#' \dontrun{
Marc Kupietz	3c531f6	2019-09-13 12:17:24 +0200	[diff] [blame]	164	#'
Marc Kupietz	603491f	2019-09-18 14:01:02 +0200	[diff] [blame]	165	#' # Plot the time/frequency curve of "Ameisenplage"
Marc Kupietz	d352642	2025-06-25 09:16:15 +0200	[diff] [blame]	166	#' KorAPConnection(verbose = TRUE) |>
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	167	#' {
				168	#' . ->> kco
Marc Kupietz	d352642	2025-06-25 09:16:15 +0200	[diff] [blame]	169	#' } |>
				170	#' corpusQuery("Ameisenplage") |>
				171	#' fetchAll() |>
				172	#' slot("collectedMatches") |>
				173	#' mutate(year = lubridate::year(pubDate)) |>
				174	#' dplyr::select(year) |>
				175	#' group_by(year) |>
				176	#' summarise(Count = dplyr::n()) |>
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	177	#' mutate(Freq = mapply(function(f, y) {
				178	#' f / corpusStats(kco, paste("pubDate in", y))@tokens
Marc Kupietz	d352642	2025-06-25 09:16:15 +0200	[diff] [blame]	179	#' }, Count, year)) |>
				180	#' dplyr::select(-Count) |>
				181	#' complete(year = min(year):max(year), fill = list(Freq = 0)) |>
Marc Kupietz	69cc54a	2019-09-30 12:06:54 +0200	[diff] [blame]	182	#' plot(type = "l")
Marc Kupietz	05b2277	2020-02-18 21:58:42 +0100	[diff] [blame]	183	#' }
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	184	#' @seealso [KorAPConnection()], [fetchNext()], [fetchRest()], [fetchAll()], [corpusStats()]
Marc Kupietz	632cbd4	2019-09-06 16:04:51 +0200	[diff] [blame]	185	#'
				186	#' @references
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	187	#' <https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026>
Marc Kupietz	632cbd4	2019-09-06 16:04:51 +0200	[diff] [blame]	188	#'
				189	#' @export
setMethod(
  "corpusQuery", "KorAPConnection",
  function(kco,
           query = if (missing(KorAPUrl)) {
             stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
           } else {
             httr2::url_parse(KorAPUrl)$query$q
           },
           vc = if (missing(KorAPUrl)) "" else httr2::url_parse(KorAPUrl)$query$cq,
           KorAPUrl,
           metadataOnly = TRUE,
           ql = if (missing(KorAPUrl)) "poliqarp" else httr2::url_parse(KorAPUrl)$query$ql,
           fields = c(
             "corpusSigle",
             "textSigle",
             "pubDate",
             "pubPlace",
             "availability",
             "textClass",
             "snippet",
             "tokens"
           ),
           accessRewriteFatal = TRUE,
           verbose = kco@verbose,
           expand = length(vc) != length(query),
           as.df = FALSE,
           context = NULL) {
    # ---- Local helpers (closures over the method arguments) ----------------

    # Fields actually requested from the API: content fields are dropped for
    # metadata-only queries and `textSigle` is always included.
    resolve_fields <- function() {
      content_fields <- c("snippet", "tokens")
      selected <- if (metadataOnly) fields[!fields %in% content_fields] else fields
      if (!"textSigle" %in% selected) {
        selected <- c(selected, "textSigle")
      }
      selected
    }

    # Builds the request URLs for one query/vc pair, logs the search and
    # issues a hit-count-only API call (`count=0`).
    # Returns a list with `request`, `webUIRequestUrl`, `requestUrl`, `res`.
    run_count_query <- function(query, vc, query_fields) {
      context_part <- if (!metadataOnly && !is.null(context) && context != "") {
        paste0("&context=", url_encode(enc2utf8(context)))
      } else {
        ""
      }
      request <- paste0(
        "?q=",
        url_encode(enc2utf8(query)),
        context_part,
        # ifelse() (not if/else) on purpose: `vc` may be NULL when it is
        # derived from a KorAPUrl without a `cq` parameter.
        ifelse(vc != "", paste0("&cq=", url_encode(enc2utf8(vc))), ""),
        if (!metadataOnly) "&show-tokens=true" else "",
        "&ql=", ql
      )
      requestUrl <- paste0(
        kco@apiUrl,
        "search",
        request,
        "&fields=",
        paste(query_fields, collapse = ","),
        if (metadataOnly) "&access-rewrite-disabled=true" else ""
      )
      log_info(verbose, "\rSearching \"", query, "\" in \"", vc, "\"", sep = "")
      list(
        request = request,
        webUIRequestUrl = paste0(kco@KorAPUrl, request),
        requestUrl = requestUrl,
        res = apiCall(kco, paste0(requestUrl, "&count=0"))
      )
    }

    # Rounds a benchmark such as "0.1234s" to two decimals. Tolerates a comma
    # as decimal separator ("0,12s") and falls back to the raw value whenever
    # the format is not the expected one.
    format_benchmark <- function(benchmark) {
      if (is.character(benchmark) && grepl("s$", benchmark)) {
        seconds <- suppressWarnings(
          as.numeric(gsub(",", ".", sub("s$", "", benchmark)))
        )
        if (!is.na(seconds)) {
          return(paste0(round(seconds, 2), "s"))
        }
      }
      benchmark
    }

    # One-row summary used for data-frame style results.
    as_result_row <- function(query, vc, totalResults, webUIRequestUrl) {
      data.frame(
        query = query,
        totalResults = totalResults,
        vc = vc,
        webUIRequestUrl = webUIRequestUrl,
        stringsAsFactors = FALSE
      )
    }

    # ---- Several queries and/or virtual corpora ------------------------------
    if (length(query) > 1 || length(vc) > 1) {
      grid <- if (expand) expand_grid(query = query, vc = vc) else tibble(query = query, vc = vc)
      query_fields <- resolve_fields()

      # Timing information for the ETA shown in verbose mode
      total_queries <- nrow(grid)
      start_time <- Sys.time()

      results <- purrr::pmap(
        list(
          query = grid$query,
          vc = grid$vc,
          current_query = seq_len(total_queries)
        ),
        function(query, vc, current_query) {
          outcome <- run_count_query(query, vc, query_fields)
          res <- outcome$res
          if (is.null(res)) {
            log_info(verbose, ": API call failed\n")
            totalResults <- 0
          } else {
            # Check for query rewrites and warn the user
            warnOnRewrites(res)

            totalResults <- as.integer(res$meta$totalResults)
            log_info(verbose, ": ", totalResults, " hits")
            if (!is.null(res$meta$cached)) {
              log_info(verbose, " [cached]")
            } else if (!is.null(res$meta$benchmark)) {
              log_info(verbose, ", took ", format_benchmark(res$meta$benchmark))
            }

            # Append ETA information to the same output line
            if (verbose && total_queries > 1) {
              eta_info <- calculate_eta(current_query, total_queries, start_time)
              if (eta_info != "") {
                log_info(verbose, eta_info)
              }
            }

            log_info(verbose, "\n")
          }

          as_result_row(query, vc, totalResults, outcome$webUIRequestUrl)
        }
      )

      return(bind_rows(results))
    }

    # ---- Single query --------------------------------------------------------
    query_fields <- resolve_fields()
    outcome <- run_count_query(query, vc, query_fields)
    res <- outcome$res
    if (is.null(res)) {
      message("API call failed.")
      totalResults <- 0
    } else {
      # Check for query rewrites and warn the user
      warnOnRewrites(res)

      totalResults <- as.integer(res$meta$totalResults)
      log_info(verbose, ": ", totalResults, " hits")
      if (!is.null(res$meta$cached)) {
        log_info(verbose, " [cached]\n")
      } else if (!is.null(res$meta$benchmark)) {
        log_info(verbose, ", took ", format_benchmark(res$meta$benchmark), "\n", sep = "")
      } else {
        log_info(verbose, "\n")
      }
    }

    if (as.df) {
      as_result_row(query, vc, totalResults, outcome$webUIRequestUrl)
    } else {
      KorAPQuery(
        korapConnection = kco,
        nextStartIndex = 0,
        fields = query_fields,
        requestUrl = outcome$requestUrl,
        request = outcome$request,
        totalResults = totalResults,
        vc = vc,
        apiResponse = res,
        webUIRequestUrl = outcome$webUIRequestUrl,
        hasMoreMatches = (totalResults > 0)
      )
    }
  }
)
Marc Kupietz	5bbc9db	2019-08-30 16:30:45 +0200	[diff] [blame]	395
# Warn the user about query rewrites reported in an API response.
#
# `res` is the parsed API response; rewrites are read from
# `res$collection$rewrites`, which may describe several rewrites at once
# (e.g. a data frame with one row per rewrite). A warning is raised for every
# rewrite whose `_comment` is present and is not just the standard policy
# message. Returns NULL invisibly.
warnOnRewrites <- function(res) {
  rewrites <- res$collection$rewrites
  if (is.null(rewrites)) {
    return(invisible(NULL))
  }

  comments <- rewrites$`_comment`
  if (is.null(comments)) {
    return(invisible(NULL))
  }

  # Vectorised filter: a scalar `&&` test would fail on several rewrites or on
  # a missing (NA) comment.
  standard_policy_comment <- "All corpus access policy has been added."
  reportable <- which(!is.na(comments) & comments != standard_policy_comment)

  editors <- rewrites$editor
  for (i in reportable) {
    # Pair each comment with its own editor when the two are aligned;
    # otherwise pass the editor information through unchanged.
    editor <- if (length(editors) == length(comments)) editors[[i]] else editors
    warning(editor, " had to rewrite your query: ", comments[[i]])
  }
  invisible(NULL)
}
				406
Marc Kupietz	05a6079	2024-12-07 16:23:31 +0100	[diff] [blame]	407	#' @importFrom purrr map
				408	repair_data_strcuture <- function(x) {
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	409	if (is.list(x)) {
				410	as.character(purrr::map(x, ~ if (length(.x) > 1) {
Marc Kupietz	05a6079	2024-12-07 16:23:31 +0100	[diff] [blame]	411	paste(.x, collapse = " ")
				412	} else {
				413	.x
				414	}))
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	415	} else {
Marc Kupietz	05a6079	2024-12-07 16:23:31 +0100	[diff] [blame]	416	ifelse(is.na(x), "", x)
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	417	}
Marc Kupietz	05a6079	2024-12-07 16:23:31 +0100	[diff] [blame]	418	}
				419
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	420	#' Fetch the next bunch of results of a KorAP query.
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	421	#'
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	422	#' `fetchNext` fetches the next bunch of results of a KorAP query.
Marc Kupietz	3f57528	2019-10-04 14:46:04 +0200	[diff] [blame]	423	#'
Marc Kupietz	a8c40f4	2025-06-24 15:49:52 +0200	[diff] [blame]	424	#' @family corpus search functions
				425	#'
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	426	#' @param kqo object obtained from [corpusQuery()]
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	427	#' @param offset start offset for query results to fetch
				428	#' @param maxFetch maximum number of query results to fetch
Marc Kupietz	25aebc3	2019-09-16 18:40:50 +0200	[diff] [blame]	429	#' @param verbose print progress information if true
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	430	#' @param randomizePageOrder fetch result pages in pseudo random order if true. Use [set.seed()] to set seed for reproducible results.
				431	#' @return The `kqo` input object with updated slots `collectedMatches`, `apiResponse`, `nextStartIndex`, `hasMoreMatches`
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	432	#'
Marc Kupietz	05b2277	2020-02-18 21:58:42 +0100	[diff] [blame]	433	#' @examples
Marc Kupietz	6ae7605	2021-09-21 10:34:00 +0200	[diff] [blame]	434	#' \dontrun{
				435	#'
Marc Kupietz	d352642	2025-06-25 09:16:15 +0200	[diff] [blame]	436	#' q <- KorAPConnection() |>
				437	#' corpusQuery("Ameisenplage") |>
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	438	#' fetchNext()
Marc Kupietz	05b2277	2020-02-18 21:58:42 +0100	[diff] [blame]	439	#' q@collectedMatches
Marc Kupietz	657d8e7	2020-02-25 18:31:50 +0100	[diff] [blame]	440	#' }
Marc Kupietz	05b2277	2020-02-18 21:58:42 +0100	[diff] [blame]	441	#'
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	442	#' @references
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	443	#' <https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026>
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	444	#'
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	445	#' @aliases fetchNext
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	446	#' @importFrom dplyr rowwise mutate bind_rows select summarise n select
Marc Kupietz	f488112	2024-12-17 14:55:39 +0100	[diff] [blame]	447	#' @importFrom tibble enframe add_column
				448	#' @importFrom stringr word
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	449	#' @importFrom tidyr unnest unchop pivot_wider
				450	#' @importFrom purrr map
Marc Kupietz	632cbd4	2019-09-06 16:04:51 +0200	[diff] [blame]	451	#' @export
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	452	setMethod("fetchNext", "KorAPQuery", function(kqo,
				453	offset = kqo@nextStartIndex,
				454	maxFetch = maxResultsPerPage,
				455	verbose = kqo@korapConnection@verbose,
				456	randomizePageOrder = FALSE) {
Marc Kupietz	a7a8f1b	2024-12-18 15:56:19 +0100	[diff] [blame]	457	# https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	458	results <- key <- name <- tmp_positions <- 0
Marc Kupietz	a7a8f1b	2024-12-18 15:56:19 +0100	[diff] [blame]	459
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	460	if (kqo@totalResults == 0 \|\| offset >= kqo@totalResults) {
				461	return(kqo)
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	462	}
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	463	use_korap_api <- Sys.getenv("USE_KORAP_API", unset = NA)
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	464	# Calculate the initial page number (not used directly - keeping for reference)
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	465	collectedMatches <- kqo@collectedMatches
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	466
Marc Kupietz	24799fd	2025-06-25 14:15:36 +0200	[diff] [blame]	467	# Track start time for ETA calculation
				468	start_time <- Sys.time()
				469
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	470	# For randomized page order, generate a list of randomized page indices
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	471	if (randomizePageOrder) {
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	472	# Calculate how many pages we need to fetch based on maxFetch
				473	total_pages_to_fetch <- if (!is.na(maxFetch)) {
				474	# Either limited by maxFetch or total results, whichever is smaller
				475	min(ceiling(maxFetch / maxResultsPerPage), ceiling(kqo@totalResults / maxResultsPerPage))
				476	} else {
				477	# All pages
				478	ceiling(kqo@totalResults / maxResultsPerPage)
				479	}
				480
				481	# Generate randomized page indices (0-based for API)
				482	pages <- sample.int(ceiling(kqo@totalResults / maxResultsPerPage), total_pages_to_fetch) - 1
				483	page_index <- 1 # Index to track which page in the randomized list we're on
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	484	}
				485
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	486	if (is.null(collectedMatches)) {
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	487	collectedMatches <- data.frame()
				488	}
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	489
				490	# Initialize the page counter properly based on nextStartIndex and any previously fetched results
				491	# We add 1 to make it 1-based for display purposes since users expect page numbers to start from 1
				492	# For first call, this will be 1, for subsequent calls, it will reflect our actual position
				493	current_page_number <- ceiling(offset / maxResultsPerPage) + 1
				494
				495	# For sequential fetches, keep track of which global page we're on
				496	# This is important for correctly showing page numbers in subsequent fetchNext calls
				497	page_count_start <- current_page_number
				498
Marc Kupietz	5bbc9db	2019-08-30 16:30:45 +0200	[diff] [blame]	499	repeat {
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	500	# Determine which page to fetch next
				501	if (randomizePageOrder) {
				502	# In randomized mode, get the page from our randomized list using the page_index
				503	# Make sure we don't exceed the array bounds
				504	if (page_index > length(pages)) {
				505	break # No more pages to fetch in randomized mode
				506	}
				507	current_offset_page <- pages[page_index]
				508	# For display purposes in randomized mode, show which page out of the total we're fetching
				509	display_page_number <- page_index
				510	} else {
				511	# In sequential mode, use the current_page_number to calculate the offset
				512	current_offset_page <- (current_page_number - 1)
				513	display_page_number <- current_page_number
				514	}
				515
				516	# Calculate the actual offset in tokens
				517	currentOffset <- current_offset_page * maxResultsPerPage
				518
Marc Kupietz	ef0e939	2025-06-18 12:21:49 +0200	[diff] [blame]	519	# Build the query with the appropriate count and offset using httr2
				520	count_param <- min(if (!is.na(maxFetch)) maxFetch - results else maxResultsPerPage, maxResultsPerPage)
Marc Kupietz	ecc8670	2025-06-24 12:12:51 +0200	[diff] [blame]	521
Marc Kupietz	ef0e939	2025-06-18 12:21:49 +0200	[diff] [blame]	522	# Parse existing URL to preserve all query parameters
				523	parsed_url <- httr2::url_parse(kqo@requestUrl)
				524	existing_query <- parsed_url$query
Marc Kupietz	ecc8670	2025-06-24 12:12:51 +0200	[diff] [blame]	525
Marc Kupietz	ef0e939	2025-06-18 12:21:49 +0200	[diff] [blame]	526	# Add/update count and offset parameters
				527	existing_query$count <- count_param
				528	existing_query$offset <- currentOffset
Marc Kupietz	ecc8670	2025-06-24 12:12:51 +0200	[diff] [blame]	529
Marc Kupietz	ef0e939	2025-06-18 12:21:49 +0200	[diff] [blame]	530	# Rebuild the URL with all parameters
				531	query <- httr2::url_modify(kqo@requestUrl, query = existing_query)
Marc Kupietz	336c85d	2025-07-24 13:52:03 +0200	[diff] [blame]	532
Marc Kupietz	6817095	2021-06-30 09:37:21 +0200	[diff] [blame]	533	res <- apiCall(kqo@korapConnection, query)
				534	if (length(res$matches) == 0) {
				535	break
				536	}
				537
Marc Kupietz	336c85d	2025-07-24 13:52:03 +0200	[diff] [blame]	538	# Check for query rewrites and warn the user
				539	warnOnRewrites(res)
				540
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	541	if ("fields" %in% colnames(res$matches) && (is.na(use_korap_api) \|\| as.numeric(use_korap_api) >= 1.0)) {
Marc Kupietz	16ccf11	2025-01-26 13:25:27 +0100	[diff] [blame]	542	log_info(verbose, "Using fields API: ")
Marc Kupietz	05a6079	2024-12-07 16:23:31 +0100	[diff] [blame]	543	currentMatches <- res$matches$fields %>%
				544	purrr::map(~ mutate(.x, value = repair_data_strcuture(value))) %>%
				545	tibble::enframe() %>%
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	546	tidyr::unnest(cols = value) %>%
				547	tidyr::pivot_wider(names_from = key, id_cols = name, names_repair = "unique") %>%
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	548	dplyr::select(-name)
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	549	if ("snippet" %in% colnames(res$matches)) {
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	550	currentMatches$snippet <- res$matches$snippet
				551	}
Marc Kupietz	3cd2c6c	2025-01-08 20:35:39 +0100	[diff] [blame]	552	if ("tokens" %in% colnames(res$matches)) {
				553	currentMatches$tokens <- res$matches$tokens
				554	}
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	555	} else {
				556	currentMatches <- res$matches
				557	}
				558
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	559	for (field in kqo@fields) {
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	560	if (!field %in% colnames(currentMatches)) {
				561	currentMatches[, field] <- NA
Marc Kupietz	5bbc9db	2019-08-30 16:30:45 +0200	[diff] [blame]	562	}
				563	}
Marc Kupietz	f488112	2024-12-17 14:55:39 +0100	[diff] [blame]	564	currentMatches <- currentMatches %>%
				565	select(kqo@fields) %>%
				566	mutate(
Marc Kupietz	ff712a9	2025-07-18 09:07:23 +0200	[diff] [blame]	567	matchID = res$matches$matchID,
Marc Kupietz	0447da0	2025-01-08 20:51:09 +0100	[diff] [blame]	568	tmp_positions = gsub(".-p(\\d+)-(\\d+).", "\\1 \\2", res$matches$matchID),
Marc Kupietz	f488112	2024-12-17 14:55:39 +0100	[diff] [blame]	569	matchStart = as.integer(stringr::word(tmp_positions, 1)),
				570	matchEnd = as.integer(stringr::word(tmp_positions, 2)) - 1
				571	) %>%
				572	select(-tmp_positions)
				573
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	574	if (!is.list(collectedMatches)) {
				575	collectedMatches <- currentMatches
Marc Kupietz	5bbc9db	2019-08-30 16:30:45 +0200	[diff] [blame]	576	} else {
Marc Kupietz	2078bde	2023-08-27 16:46:15 +0200	[diff] [blame]	577	collectedMatches <- bind_rows(collectedMatches, currentMatches)
Marc Kupietz	5bbc9db	2019-08-30 16:30:45 +0200	[diff] [blame]	578	}
Marc Kupietz	ae9b617	2025-05-02 15:50:01 +0200	[diff] [blame]	579
Marc Kupietz	336c85d	2025-07-24 13:52:03 +0200	[diff] [blame]	580
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	581	# Get the actual items per page from the API response
				582	# We now consistently use maxResultsPerPage instead
Marc Kupietz	acbaab0	2025-05-01 10:56:35 +0200	[diff] [blame]	583
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	584	# Calculate total pages consistently using fixed maxResultsPerPage
				585	# This ensures consistent page counting across the function
				586	total_pages <- ceiling(kqo@totalResults / maxResultsPerPage)
				587
Marc Kupietz	24799fd	2025-06-25 14:15:36 +0200	[diff] [blame]	588	# Calculate ETA using the centralized function from logging.R
				589	current_page <- if (randomizePageOrder) page_index else display_page_number
				590	total_pages_to_fetch <- if (!is.na(maxFetch)) {
				591	# Account for offset - we can only fetch from the remaining results after offset
				592	remaining_results_after_offset <- max(0, kqo@totalResults - offset)
				593	min(ceiling(maxFetch / maxResultsPerPage), ceiling(remaining_results_after_offset / maxResultsPerPage))
				594	} else {
				595	total_pages
				596	}
Marc Kupietz	365660e	2025-06-25 15:09:55 +0200	[diff] [blame]	597
Marc Kupietz	24799fd	2025-06-25 14:15:36 +0200	[diff] [blame]	598	eta_info <- calculate_eta(current_page, total_pages_to_fetch, start_time)
Marc Kupietz	365660e	2025-06-25 15:09:55 +0200	[diff] [blame]	599
Marc Kupietz	24799fd	2025-06-25 14:15:36 +0200	[diff] [blame]	600	# Extract timing information for display
Marc Kupietz	ae9b617	2025-05-02 15:50:01 +0200	[diff] [blame]	601	time_per_page <- NA
Marc Kupietz	ae9b617	2025-05-02 15:50:01 +0200	[diff] [blame]	602	if (!is.null(res$meta$benchmark) && is.character(res$meta$benchmark)) {
Marc Kupietz	ae9b617	2025-05-02 15:50:01 +0200	[diff] [blame]	603	time_per_page <- suppressWarnings(as.numeric(sub("s", "", res$meta$benchmark)))
Marc Kupietz	acbaab0	2025-05-01 10:56:35 +0200	[diff] [blame]	604	}
				605
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	606	# Create the page display string with proper formatting
Marc Kupietz	acbaab0	2025-05-01 10:56:35 +0200	[diff] [blame]	607
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	608	# For global page tracking, calculate the absolute page number
				609	actual_display_number <- if (randomizePageOrder) {
				610	current_offset_page + 1 # In randomized mode, this is the actual page (0-based + 1)
				611	} else {
				612	# In sequential mode, the absolute page number is the actual offset page + 1 (to make it 1-based)
				613	current_offset_page + 1
				614	}
				615
				616	# For subsequent calls to fetchNext, we need to calculate the correct page numbers
				617	# based on the current batch being fetched
				618
				619	# For each call to fetchNext, we want to show 1/2, 2/2 (not 3/4, 4/4)
				620	# Simply count from 1 within the current batch
				621
				622	# The relative page number is simply the current position in this batch
				623	if (randomizePageOrder) {
				624	relative_page_number <- page_index # In randomized mode, we start from 1 in each batch
				625	} else {
				626	relative_page_number <- display_page_number - (page_count_start - 1)
				627	}
				628
				629	# How many pages will we fetch in this batch?
Marc Kupietz	021663d	2025-06-18 17:49:22 +0200	[diff] [blame]	630	# If maxFetch is specified, calculate the total pages for this fetch operation
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	631	pages_in_this_batch <- if (!is.na(maxFetch)) {
Marc Kupietz	021663d	2025-06-18 17:49:22 +0200	[diff] [blame]	632	# Account for offset - we can only fetch from the remaining results after offset
				633	remaining_results_after_offset <- max(0, kqo@totalResults - offset)
				634	min(ceiling(maxFetch / maxResultsPerPage), ceiling(remaining_results_after_offset / maxResultsPerPage))
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	635	} else {
				636	# Otherwise fetch all remaining pages
				637	total_pages - page_count_start + 1
				638	}
				639
				640	# The total pages to be shown in this batch
				641	batch_total_pages <- pages_in_this_batch
				642
				643	page_display <- paste0(
				644	"Retrieved page ",
				645	sprintf(paste0("%", nchar(batch_total_pages), "d"), relative_page_number),
				646	"/",
				647	sprintf("%d", batch_total_pages)
				648	)
				649
				650	# If randomized, also show which actual page we fetched
				651	if (randomizePageOrder) {
				652	# Determine the maximum width needed for page numbers (based on total pages)
				653	# This ensures consistent alignment
				654	max_page_width <- nchar(as.character(total_pages))
				655	# Add the actual page number that was fetched (0-based + 1 for display) with proper padding
Marc Kupietz	7638ca4	2025-05-25 13:18:16 +0200	[diff] [blame]	656	page_display <- paste0(
				657	page_display,
				658	sprintf(" (actual page %*d)", max_page_width, current_offset_page + 1)
				659	)
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	660	}
				661	# Always show the absolute page number and total pages (for clarity)
				662	else {
				663	# Show the absolute page number (out of total possible pages)
				664	page_display <- paste0(page_display, sprintf(
				665	" (page %d of %d total)",
				666	actual_display_number, total_pages
				667	))
				668	}
				669
				670	# Add caching or timing information
				671	if (!is.null(res$meta$cached)) {
				672	page_display <- paste0(page_display, " [cached]")
				673	} else {
				674	page_display <- paste0(
				675	page_display,
				676	" in ",
				677	if (!is.na(time_per_page)) sprintf("%4.1f", time_per_page) else "?",
Marc Kupietz	24799fd	2025-06-25 14:15:36 +0200	[diff] [blame]	678	"s",
				679	eta_info
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	680	)
				681	}
				682
				683	log_info(verbose, paste0(page_display, "\n"))
				684
				685	# Increment the appropriate counter based on mode
				686	if (randomizePageOrder) {
				687	page_index <- page_index + 1
				688	} else {
				689	current_page_number <- current_page_number + 1
				690	}
Marc Kupietz	5bbc9db	2019-08-30 16:30:45 +0200	[diff] [blame]	691	results <- results + res$meta$itemsPerPage
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	692	if (nrow(collectedMatches) >= kqo@totalResults \|\| (!is.na(maxFetch) && results >= maxFetch)) {
Marc Kupietz	5bbc9db	2019-08-30 16:30:45 +0200	[diff] [blame]	693	break
				694	}
				695	}
Marc Kupietz	6817095	2021-06-30 09:37:21 +0200	[diff] [blame]	696	nextStartIndex <- min(res$meta$startIndex + res$meta$itemsPerPage, kqo@totalResults)
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	697	KorAPQuery(
				698	nextStartIndex = nextStartIndex,
Marc Kupietz	d0d3e9b	2019-09-24 17:36:03 +0200	[diff] [blame]	699	korapConnection = kqo@korapConnection,
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	700	fields = kqo@fields,
				701	requestUrl = kqo@requestUrl,
				702	request = kqo@request,
Marc Kupietz	6817095	2021-06-30 09:37:21 +0200	[diff] [blame]	703	totalResults = kqo@totalResults,
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	704	vc = kqo@vc,
				705	webUIRequestUrl = kqo@webUIRequestUrl,
Marc Kupietz	6817095	2021-06-30 09:37:21 +0200	[diff] [blame]	706	hasMoreMatches = (kqo@totalResults > nextStartIndex),
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	707	apiResponse = res,
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	708	collectedMatches = collectedMatches
				709	)
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	710	})
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	711
				712	#' Fetch all results of a KorAP query.
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	713	#'
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	714	#' `fetchAll` fetches all results of a KorAP query.
Marc Kupietz	a6e4ee6	2021-03-05 09:00:15 +0100	[diff] [blame]	715	#'
Marc Kupietz	a8c40f4	2025-06-24 15:49:52 +0200	[diff] [blame]	716	#' @family corpus search functions
Marc Kupietz	dc880ac	2025-06-24 20:34:43 +0200	[diff] [blame]	717	#' @param kqo object obtained from [corpusQuery()]
				718	#' @param verbose print progress information if true
				719	#' @param ... further arguments passed to [fetchNext()]
				720	#' @return The updated `kqo` object with all results in `@collectedMatches`
Marc Kupietz	a8c40f4	2025-06-24 15:49:52 +0200	[diff] [blame]	721	#'
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	722	#' @examples
Marc Kupietz	6ae7605	2021-09-21 10:34:00 +0200	[diff] [blame]	723	#' \dontrun{
Marc Kupietz	ecc8670	2025-06-24 12:12:51 +0200	[diff] [blame]	724	#' # Fetch all metadata of every query hit for "Ameisenplage" and show a summary
				725	#' q <- KorAPConnection() \|>
				726	#' corpusQuery("Ameisenplage") \|>
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	727	#' fetchAll()
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	728	#' q@collectedMatches
Marc Kupietz	ecc8670	2025-06-24 12:12:51 +0200	[diff] [blame]	729	#'
				730	#' # Fetch also all KWICs
				731	#' q <- KorAPConnection() \|> auth() \|>
				732	#' corpusQuery("Ameisenplage", metadataOnly = FALSE) \|>
				733	#' fetchAll()
				734	#' q@collectedMatches
				735	#'
				736	#' # Retrieve title and text sigle metadata of all texts published on 1958-03-12
				737	#' q <- KorAPConnection() \|>
				738	#' corpusQuery("<base/s=t>", # this matches each text once
				739	#' vc = "pubDate in 1958-03-12",
				740	#' fields = c("textSigle", "title"),
				741	#' ) \|>
				742	#' fetchAll()
				743	#' q@collectedMatches
Marc Kupietz	05b2277	2020-02-18 21:58:42 +0100	[diff] [blame]	744	#' }
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	745	#'
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	746	#' @aliases fetchAll
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	747	#' @export
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	748	setMethod("fetchAll", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) {
				749	return(fetchNext(kqo, offset = 0, maxFetch = NA, verbose = verbose, ...))
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	750	})
				751
				752	#' Fetches the remaining results of a KorAP query.
				753	#'
Marc Kupietz	dc880ac	2025-06-24 20:34:43 +0200	[diff] [blame]	754	#' @param kqo object obtained from [corpusQuery()]
				755	#' @param verbose print progress information if true
				756	#' @param ... further arguments passed to [fetchNext()]
				757	#' @return The updated `kqo` object with remaining results in `@collectedMatches`
				758	#'
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	759	#' @examples
Marc Kupietz	6ae7605	2021-09-21 10:34:00 +0200	[diff] [blame]	760	#' \dontrun{
				761	#'
Marc Kupietz	d352642	2025-06-25 09:16:15 +0200	[diff] [blame]	762	#' q <- KorAPConnection() \|>
				763	#' corpusQuery("Ameisenplage") \|>
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	764	#' fetchRest()
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	765	#' q@collectedMatches
Marc Kupietz	05b2277	2020-02-18 21:58:42 +0100	[diff] [blame]	766	#' }
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	767	#'
				768	#' @aliases fetchRest
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	769	#' @export
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	770	setMethod("fetchRest", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) {
				771	return(fetchNext(kqo, maxFetch = NA, verbose = verbose, ...))
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	772	})
				773
#' Parse XML annotations into linguistic layers
#'
#' Internal helper function to extract linguistic annotations (lemma, POS,
#' morphology) from XML annotation snippets returned by the KorAP API.
#'
#' Tokens, lemmas and POS tags are read with regular expressions from the
#' `title` attributes of the spans inside `<span class="match">` (or from the
#' whole snippet when it contains no match span). Morphological features are
#' additionally collected with `xml2` over the complete snippet, including the
#' titles of enclosing spans; when that yields at least one value it replaces
#' the regex-based morphology.
#'
#' @param xml_snippet XML string containing annotation data
#' @return Named list with vectors for 'token', 'lemma', 'pos', and 'morph'
#' @keywords internal
parse_xml_annotations <- function(xml_snippet) {
  if (is.null(xml_snippet) || length(xml_snippet) == 0 ||
    is.na(xml_snippet) || xml_snippet == "") {
    return(list(
      token = character(0), lemma = character(0),
      pos = character(0), morph = character(0)
    ))
  }

  # Split annotation titles on whitespace and pick out the lemma
  # ("<foundry>/l:"), POS ("<foundry>/p:") and morphological ("<foundry>/m:")
  # values. For lemma and POS the last matching annotation wins; morphological
  # features are de-duplicated and joined with "|".
  parse_title_annotations <- function(title_contents) {
    lemma <- NA_character_
    pos_tag <- NA_character_
    morph_features <- character(0)

    for (annotation in unlist(strsplit(title_contents, "\\s+"))) {
      if (grepl("^[^/]+/l:", annotation)) {
        lemma <- gsub("^[^/]+/l:(.*)$", "\\1", annotation)
      } else if (grepl("^[^/]+/p:", annotation)) {
        pos_tag <- gsub("^[^/]+/p:(.*)$", "\\1", annotation)
      } else if (grepl("^[^/]+/m:", annotation)) {
        morph_features <- c(
          morph_features,
          gsub("^[^/]+/m:(.*)$", "\\1", annotation)
        )
      }
    }

    morph_tag <- if (length(morph_features) > 0) {
      paste(unique(morph_features), collapse = "|")
    } else {
      NA_character_
    }

    list(lemma = lemma, pos = pos_tag, morph = morph_tag)
  }

  # Collect one morphology string per innermost, non-empty span using a real
  # parser, so that "/m:" features on enclosing spans are inherited by the
  # token. Returns NULL when the snippet cannot be parsed (the tryCatch also
  # covers any other error raised by the read_html() call).
  extract_morph_via_xml <- function(fragment) {
    snippet <- paste0("<root>", fragment, "</root>")
    doc <- tryCatch(xml2::read_html(snippet), error = function(e) NULL)
    if (is.null(doc)) {
      return(NULL)
    }

    # Innermost spans only: spans that contain no further span.
    nodes <- xml2::xml_find_all(doc, ".//span[not(.//span)]")
    if (length(nodes) == 0) {
      return(list(tokens = character(0), morph = character(0)))
    }

    tokens_xml <- character(0)
    morph_vals <- character(0)

    for (node in nodes) {
      token_text <- trimws(xml2::xml_text(node))
      if (identical(token_text, "")) next

      tokens_xml <- c(tokens_xml, token_text)

      ancestors <- xml2::xml_find_all(node, "ancestor-or-self::span")
      titles <- xml2::xml_attr(ancestors, "title")
      titles <- titles[!is.na(titles)]

      feature_tokens <- character(0)
      if (length(titles) > 0) {
        bits <- unlist(strsplit(titles, "[[:space:]]+"))
        bits <- bits[grepl("/m:", bits)]
        if (length(bits) > 0) {
          # Keep only the value after the first "/m:" of each annotation.
          feature_tokens <- sub(".*?/m:(.*)$", "\\1", bits, perl = TRUE)
          feature_tokens <- unique(feature_tokens)
        }
      }

      morph_vals <- c(
        morph_vals,
        if (length(feature_tokens) == 0) {
          NA_character_
        } else {
          paste(feature_tokens, collapse = "|")
        }
      )
    }

    list(tokens = tokens_xml, morph = morph_vals)
  }

  # Restrict parsing to the content of <span class="match">...</span> when
  # such a span is present; otherwise parse the whole snippet.
  start_pos <- regexpr('<span class="match">', xml_snippet, fixed = TRUE)
  if (start_pos > 0) {
    content_start <- start_pos + attr(start_pos, "match.length")
    remaining <- substr(xml_snippet, content_start, nchar(xml_snippet))

    if (grepl('<span class="context-right">', remaining, fixed = TRUE)) {
      # Keep everything up to the first right-context span.
      content_to_parse <- gsub(
        '(.*?)<span class="context-right">.*', "\\1", remaining
      )
    } else {
      # No right context: drop the trailing </span> that closes the match span.
      content_to_parse <- gsub("(.*)</span>\\s*$", "\\1", remaining)
    }
  } else {
    content_to_parse <- xml_snippet
  }

  tokens <- character(0)
  lemmas <- character(0)
  pos_tags <- character(0)
  morph_tags <- character(0)

  # First pass: split at every </span>; a chunk that carries title attributes
  # and ends in plain text is one token together with all titles of the spans
  # opened in that chunk.
  title_pattern <- 'title="([^"]*)"'
  parts <- trimws(unlist(strsplit(content_to_parse, "</span>")))

  for (part in parts) {
    if (nchar(part) == 0 || !grepl("<span[^>]*title=", part)) next

    # The token text is whatever follows the last ">" of the chunk.
    text_content <- trimws(gsub(".*>([^<]*)$", "\\1", part))
    if (nchar(text_content) == 0 || grepl("^<", text_content)) next

    all_titles <- regmatches(part, gregexpr(title_pattern, part))[[1]]
    parsed <- parse_title_annotations(gsub(title_pattern, "\\1", all_titles))

    tokens <- c(tokens, text_content)
    lemmas <- c(lemmas, parsed$lemma)
    pos_tags <- c(pos_tags, parsed$pos)
    morph_tags <- c(morph_tags, parsed$morph)
  }

  # Second pass, only if the first found nothing: match innermost spans that
  # directly wrap text and use just their own title.
  if (length(tokens) == 0) {
    innermost_pattern <- '<span[^>]*title="([^"]*)"[^>]*>([^<]+)</span>'
    matches <- regmatches(
      content_to_parse,
      gregexpr(innermost_pattern, content_to_parse, perl = TRUE)
    )[[1]]

    for (match in matches) {
      title <- gsub(innermost_pattern, "\\1", match, perl = TRUE)
      text <- trimws(gsub(innermost_pattern, "\\2", match, perl = TRUE))
      if (nchar(text) == 0) next

      parsed <- parse_title_annotations(title)

      tokens <- c(tokens, text)
      lemmas <- c(lemmas, parsed$lemma)
      pos_tags <- c(pos_tags, parsed$pos)
      morph_tags <- c(morph_tags, parsed$morph)
    }
  }

  # Prefer the parser-based morphology whenever it produced any value.
  xml_morph <- extract_morph_via_xml(xml_snippet)
  if (!is.null(xml_morph) && length(xml_morph$morph) > 0) {
    morph_tags <- xml_morph$morph
  }

  # Pad all layers with NA so that they have the same length.
  max_length <- max(
    length(tokens), length(lemmas), length(pos_tags), length(morph_tags)
  )
  if (max_length > 0) {
    pad <- function(x) c(x, rep(NA_character_, max_length - length(x)))
    tokens <- pad(tokens)
    lemmas <- pad(lemmas)
    pos_tags <- pad(pos_tags)
    morph_tags <- pad(morph_tags)
  }

  list(
    token = tokens,
    lemma = lemmas,
    pos = pos_tags,
    morph = morph_tags
  )
}
				981
				982	#'
				983	#' Parse XML annotations into linguistic layers with left/match/right structure
				984	#'
				985	#' Internal helper function to extract linguistic annotations (lemma, POS, morphology)
				986	#' from XML annotation snippets returned by the KorAP API, split into left context,
				987	#' match, and right context sections like the tokens field.
				988	#'
				989	#' @param xml_snippet XML string containing annotation data
				990	#' @return Named list with nested structure containing left/match/right for 'atokens', 'lemma', 'pos', and 'morph'
				991	#' @keywords internal
				992	parse_xml_annotations_structured <- function(xml_snippet) {
				993	if (is.null(xml_snippet) \|\| is.na(xml_snippet) \|\| xml_snippet == "") {
				994	empty_result <- list(left = character(0), match = character(0), right = character(0))
				995	return(list(
				996	atokens = empty_result,
				997	lemma = empty_result,
				998	pos = empty_result,
				999	morph = empty_result
				1000	))
				1001	}
				1002
Marc Kupietz	cd45218	2025-10-09 13:28:41 +0200	[diff] [blame^]	1003	extract_morphological_features_via_xml <- function(section_content) {
				1004	snippet <- paste0("<root>", section_content, "</root>")
				1005	doc <- tryCatch(xml2::read_html(snippet), error = function(e) NULL)
				1006	if (is.null(doc)) return(NULL)
				1007
				1008	nodes <- xml2::xml_find_all(doc, ".//span[not(.//span)]")
				1009	if (length(nodes) == 0) {
				1010	return(list(tokens = character(0), morph = character(0)))
				1011	}
				1012
				1013	tokens_xml <- character(0)
				1014	morph_vals <- character(0)
				1015
				1016	for (node in nodes) {
				1017	token_text <- trimws(xml2::xml_text(node))
				1018	if (identical(token_text, "")) next
				1019
				1020	tokens_xml <- c(tokens_xml, token_text)
				1021
				1022	ancestors <- xml2::xml_find_all(node, "ancestor-or-self::span")
				1023	titles <- xml2::xml_attr(ancestors, "title")
				1024	titles <- titles[!is.na(titles)]
				1025
				1026	feature_tokens <- character(0)
				1027	if (length(titles) > 0) {
				1028	bits <- unlist(strsplit(titles, "[[:space:]]+"))
				1029	bits <- bits[grepl('/m:', bits)]
				1030	if (length(bits) > 0) {
				1031	feature_tokens <- sub('.?/m:(.)$', '\\1', bits, perl = TRUE)
				1032	feature_tokens <- feature_tokens[!duplicated(feature_tokens)]
				1033	}
				1034	}
				1035
				1036	if (length(feature_tokens) == 0) {
				1037	morph_vals <- c(morph_vals, NA_character_)
				1038	} else {
				1039	morph_vals <- c(morph_vals, paste(feature_tokens, collapse = "\|"))
				1040	}
				1041	}
				1042
				1043	list(tokens = tokens_xml, morph = morph_vals)
				1044	}
				1045
# Helper function to extract annotations from a span section.
#
# Scans one section of an annotated snippet (left context, match or right
# context) for innermost `<span title="...">text</span>` elements. For every
# such element with non-empty text it derives the token text, lemma,
# part-of-speech tag and morphological features from the whitespace-separated
# title entries containing `/l:`, `/p:` and `/m:` respectively.
#
# @param section_content Character string with the HTML/XML fragment to scan.
# @return A list with the elements `tokens`, `lemmas`, `pos_tags` and
#   `morph_tags`. The vectors are padded with NA to a common length; several
#   morphological features of one token are joined with "|".
extract_annotations_from_section <- function(section_content) {
  # Innermost span with a title attribute (group 1) and direct text (group 2).
  # The `*` quantifiers matter: attributes and titles have arbitrary length.
  token_pattern <- '<span[^>]*title="([^"]*)"[^>]*>([^<]+)</span>'
  title_pattern <- 'title="([^"]*)"'
  # Number of characters before a token that are searched for outer titles
  lookback_chars <- 500

  # Return everything after the "/<layer>:" marker of each annotation entry
  annotation_value <- function(entries, layer) {
    sub(paste0(".*?/", layer, ":(.*)$"), "\\1", entries, perl = TRUE)
  }

  # Remove any <mark>...</mark> tags that may interrupt token boundaries
  section_no_marks <- gsub("</?mark[^>]*>", "", section_content, perl = TRUE)
  # Normalize separators between adjacent spans: any run of whitespace,
  # punctuation or character entities between one-or-more closing spans and
  # the next opening span becomes a single space; closing spans are preserved.
  section_norm <- gsub(
    "((?:</span>)+)[[:space:]]*(?:&[^;]+;|[[:punct:]]|[[:space:]])*[[:space:]]*(<span)",
    "\\1 \\2",
    section_no_marks,
    perl = TRUE
  )

  tokens <- character(0)
  lemmas <- character(0)
  pos_tags <- character(0)
  morph_tags <- character(0)

  hits <- gregexpr(token_pattern, section_norm, perl = TRUE)[[1]]
  if (hits[1] != -1) {
    starts <- as.integer(hits)
    lens <- attr(hits, "match.length")
    n_hits <- length(starts)

    # Preallocate one slot per candidate span; spans with empty text are
    # dropped afterwards via `keep`.
    tokens <- rep(NA_character_, n_hits)
    lemmas <- rep(NA_character_, n_hits)
    pos_tags <- rep(NA_character_, n_hits)
    morph_tags <- rep(NA_character_, n_hits)
    keep <- logical(n_hits)

    for (k in seq_len(n_hits)) {
      s <- starts[k]
      fragment <- substr(section_norm, s, s + lens[k] - 1)
      text_content <- trimws(sub(token_pattern, "\\2", fragment, perl = TRUE))
      if (nchar(text_content) == 0) next
      title_content <- sub(token_pattern, "\\1", fragment, perl = TRUE)

      lemma <- NA_character_
      pos_tag <- NA_character_
      morph_features <- character(0)

      # Parse the title of the innermost span
      for (a in unlist(strsplit(title_content, "[[:space:]]+"))) {
        if (grepl("/l:", a)) {
          lemma <- annotation_value(a, "l")
        } else if (grepl("/p:", a)) {
          pos_tag <- annotation_value(a, "p")
        } else if (grepl("/m:", a)) {
          morph_features <- c(morph_features, annotation_value(a, "m"))
        }
      }

      # Titles found in the text shortly before this token, nearest first.
      # Computed once and shared by the lemma and morphology look-backs.
      context <- substr(section_norm, max(1, s - lookback_chars), s - 1)
      ctx_hits <- gregexpr(title_pattern, context, perl = TRUE)
      ctx_titles <- character(0)
      if (ctx_hits[[1]][1] != -1) {
        ctx_titles <- rev(sub(
          title_pattern, "\\1",
          regmatches(context, ctx_hits)[[1]],
          perl = TRUE
        ))
      }

      # If the lemma is missing, take it from the nearest preceding title
      # that contains an l: entry
      if (is.na(lemma) || nchar(lemma) == 0) {
        with_lemma <- ctx_titles[grepl("/l:", ctx_titles)]
        if (length(with_lemma) > 0) {
          lemma <- sub(".*?/l:([^ ]+).*", "\\1", with_lemma[1], perl = TRUE)
        }
      }

      # POS stays NA if missing. Morphological features may also appear in
      # outer titles: skip back to the nearest title carrying m: entries and
      # collect from the contiguous run of such titles.
      collecting <- FALSE
      for (cont in ctx_titles) {
        if (grepl("/m:", cont)) {
          collecting <- TRUE
          mparts <- unlist(strsplit(cont, "[[:space:]]+"))
          features <- annotation_value(mparts[grepl("/m:", mparts)], "m")
          morph_features <- c(
            morph_features,
            features[!features %in% morph_features]
          )
        } else if (collecting) {
          break
        }
      }

      keep[k] <- TRUE
      tokens[k] <- text_content
      lemmas[k] <- lemma
      pos_tags[k] <- pos_tag
      if (length(morph_features) > 0) {
        morph_tags[k] <- paste(unique(morph_features), collapse = "|")
      }
    }

    tokens <- tokens[keep]
    lemmas <- lemmas[keep]
    pos_tags <- pos_tags[keep]
    morph_tags <- morph_tags[keep]
  }

  # Prefer the XML-based morphological tags whenever that extraction yields
  # any values.
  # NOTE(review): alignment with `tokens` is not verified here; differing
  # lengths are only reconciled by the NA padding below.
  xml_morph <- extract_morphological_features_via_xml(section_content)
  if (!is.null(xml_morph) && length(xml_morph$morph) > 0) {
    morph_tags <- xml_morph$morph
  }

  # Ensure all vectors have the same length (shorter ones are NA-padded)
  max_length <- max(
    length(tokens), length(lemmas), length(pos_tags), length(morph_tags)
  )
  if (max_length > 0) {
    length(tokens) <- max_length
    length(lemmas) <- max_length
    length(pos_tags) <- max_length
    length(morph_tags) <- max_length
  }

  return(list(
    tokens = tokens,
    lemmas = lemmas,
    pos_tags = pos_tags,
    morph_tags = morph_tags
  ))
}
				1160
				1161	# Split the XML into three parts: left context, match content, and right context
				1162	# The structure is: <span class="match">...left...<mark>...match...</mark>...right...</span>
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	1163
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1164	# First extract the content within the match span using DOTALL modifier
				1165	match_span_pattern <- '(?s)<span class="match">(.?)</span>\\s<span class="context-right">'
				1166	match_span_match <- regexpr(match_span_pattern, xml_snippet, perl = TRUE)
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	1167
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1168	if (match_span_match == -1) {
				1169	# Try alternative pattern if no context-right
				1170	match_span_pattern <- '(?s)<span class="match">(.?)</span>\\s$'
				1171	match_span_match <- regexpr(match_span_pattern, xml_snippet, perl = TRUE)
				1172	}
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	1173
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1174	if (match_span_match > 0) {
				1175	match_span_content <- gsub(match_span_pattern, '\\1', xml_snippet, perl = TRUE)
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	1176
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1177	# Now find the <mark> and </mark> positions within this content
				1178	mark_start <- regexpr('<mark[^>]*>', match_span_content, perl = TRUE)
Marc Kupietz	560b591	2025-09-01 17:36:13 +0200	[diff] [blame]	1179	# Use the LAST closing </mark> to cover multi-part matches
				1180	mark_end_gre <- gregexpr('</mark>', match_span_content, perl = TRUE)
				1181	mark_end_positions <- mark_end_gre[[1]]
				1182	mark_end <- if (!is.null(mark_end_positions) && length(mark_end_positions) > 0 && mark_end_positions[1] != -1)
				1183	mark_end_positions[length(mark_end_positions)] else -1
				1184	mark_end_len <- if (mark_end != -1) attr(mark_end_gre[[1]], "match.length")[length(mark_end_positions)] else 0
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	1185
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1186	if (mark_start > 0 && mark_end > 0) {
Marc Kupietz	560b591	2025-09-01 17:36:13 +0200	[diff] [blame]	1187	# Left context: everything before first <mark>
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1188	left_content <- substr(match_span_content, 1, mark_start - 1)
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	1189
Marc Kupietz	560b591	2025-09-01 17:36:13 +0200	[diff] [blame]	1190	# Match content: everything between first <mark> and last </mark>
				1191	match_content <- substr(match_span_content, mark_start, mark_end + mark_end_len - 1)
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	1192
Marc Kupietz	560b591	2025-09-01 17:36:13 +0200	[diff] [blame]	1193	# Right context: everything after last </mark>
				1194	right_content_start <- mark_end + mark_end_len
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1195	right_content <- substr(match_span_content, right_content_start, nchar(match_span_content))
				1196	} else {
				1197	# No mark tags found, treat entire match span as match content
				1198	left_content <- ""
				1199	match_content <- match_span_content
				1200	right_content <- ""
				1201	}
				1202	} else {
				1203	# No match span found, treat entire content as match
				1204	left_content <- ""
				1205	match_content <- xml_snippet
				1206	right_content <- ""
				1207	}
				1208
				1209	# Process each section
				1210	left_annotations <- extract_annotations_from_section(left_content)
				1211	match_annotations <- extract_annotations_from_section(match_content)
				1212	right_annotations <- extract_annotations_from_section(right_content)
				1213
				1214	return(list(
				1215	atokens = list(
				1216	left = left_annotations$tokens,
				1217	match = match_annotations$tokens,
				1218	right = right_annotations$tokens
				1219	),
				1220	lemma = list(
				1221	left = left_annotations$lemmas,
				1222	match = match_annotations$lemmas,
				1223	right = right_annotations$lemmas
				1224	),
				1225	pos = list(
				1226	left = left_annotations$pos_tags,
				1227	match = match_annotations$pos_tags,
				1228	right = right_annotations$pos_tags
				1229	),
				1230	morph = list(
				1231	left = left_annotations$morph_tags,
				1232	match = match_annotations$morph_tags,
				1233	right = right_annotations$morph_tags
				1234	)
				1235	))
				1236	}
				1237
#' Fetch annotations for all collected matches
#'
#' `r lifecycle::badge("experimental")`
#'
#' `fetchAnnotations` fetches annotations (only token annotations, for now)
#' for all matches in the `@collectedMatches` slot
#' of a KorAPQuery object and adds annotation columns directly to the `@collectedMatches`
#' data frame. The method uses the `matchID` from collected matches and falls
#' back to `textSigle`, `matchStart` and `matchEnd` if no usable `matchID` is
#' available.
#'
#' Important: For copyright-restricted corpora, users must be authorized via [auth()]
#' and the initial corpus query must have `metadataOnly = FALSE` to ensure snippets are
#' available for annotation parsing.
#'
#' The method parses XML snippet annotations and adds linguistic columns to the data frame:
#' - `pos`: data frame with `left`, `match`, `right` columns, each containing list vectors of part-of-speech tags
#' - `lemma`: data frame with `left`, `match`, `right` columns, each containing list vectors of lemmas
#' - `morph`: data frame with `left`, `match`, `right` columns, each containing list vectors of morphological tags
#' - `atokens`: data frame with `left`, `match`, `right` columns, each containing list vectors of token text (from annotations)
#' - `annotation_snippet`: original XML snippet from the annotation API
#'
#' If the request for a match fails, the cells of that row are set to `NA`
#' (only in columns that were created or are overwritten by this call); if the
#' response contains no snippet, they are set to empty character vectors.
#'
#' @family corpus search functions
#' @concept Annotations
#'
#' @param kqo object obtained from [corpusQuery()] with collected matches. Note: the original corpus query should have `metadataOnly = FALSE` for annotation parsing to work.
#' @param foundry string specifying the foundry to use for annotations (default: "tt" for Tree-Tagger)
#' @param overwrite logical; if TRUE, re-fetch and replace any existing
#'   annotation columns. If FALSE (default), only add missing annotation layers
#'   and preserve already fetched ones (e.g., keep POS/lemma from a previous
#'   foundry while adding morph from another).
#' @param verbose print progress information if true
#' @return The updated `kqo` object with annotation columns
#' like `pos`, `lemma`, `morph` (and `atokens` and `annotation_snippet`)
#' in the `@collectedMatches` slot. Each column is a data frame
#' with `left`, `match`, and `right` columns containing list vectors of annotations
#' for the left context, matched tokens, and right context, respectively.
#' The original XML snippet for each match is also stored in `annotation_snippet`.
#'
#' @examples
#' \dontrun{
#'
#' # Fetch annotations for matches using Tree-Tagger foundry
#' # Note: Authorization required for copyright-restricted corpora
#' q <- KorAPConnection() |>
#'   auth() |>
#'   corpusQuery("Ameisenplage", metadataOnly = FALSE) |>
#'   fetchNext(maxFetch = 10) |>
#'   fetchAnnotations()
#'
#' # Access linguistic annotations:
#' pos_tags <- q@collectedMatches$pos
#' # Data frame with left/match/right columns for POS tags
#' lemmas <- q@collectedMatches$lemma
#' # Data frame with left/match/right columns for lemmas
#' morphology <- q@collectedMatches$morph
#' # Data frame with left/match/right columns for morphological tags
#' atokens <- q@collectedMatches$atokens
#' # Data frame with left/match/right columns for annotation token text
#'
#' # Original XML snippet for match i
#' i <- 1
#' raw_snippet <- q@collectedMatches$annotation_snippet[[i]]
#'
#' # Access specific components:
#' # POS tags for the matched tokens in match i
#' match_pos <- q@collectedMatches$pos$match[[i]]
#' # Lemmas for the left context in match i
#' left_lemmas <- q@collectedMatches$lemma$left[[i]]
#' # Token text for the right context in match i
#' right_tokens <- q@collectedMatches$atokens$right[[i]]
#'
#' # Use a different foundry (e.g., MarMoT)
#' q <- KorAPConnection() |>
#'   auth() |>
#'   corpusQuery("Ameisenplage", metadataOnly = FALSE) |>
#'   fetchNext(maxFetch = 10) |>
#'   fetchAnnotations(foundry = "marmot")
#' q@collectedMatches$pos$left[1] # POS tags for the left context of the first match
#' }
#' @export
setMethod("fetchAnnotations", "KorAPQuery", function(kqo,
                                                     foundry = "tt",
                                                     overwrite = FALSE,
                                                     verbose = kqo@korapConnection@verbose) {
  if (is.null(kqo@collectedMatches) ||
    nrow(kqo@collectedMatches) == 0) {
    warning("No collected matches found. Please run fetchNext() or fetchAll() first.")
    return(kqo)
  }

  df <- kqo@collectedMatches
  kco <- kqo@korapConnection
  nrows <- nrow(df)
  annotation_types <- c("pos", "lemma", "morph", "atokens")

  # Track which annotation columns already existed to decide overwrite behavior
  tracked_columns <- c(annotation_types, "annotation_snippet")
  existing_types <- lapply(
    tracked_columns,
    function(column) column %in% colnames(df)
  )
  names(existing_types) <- tracked_columns

  # Initialize annotation columns as data frames (like the tokens field) whose
  # left/match/right columns are list vectors of empty character vectors
  empty_char_list <- I(replicate(nrows, character(0), simplify = FALSE))
  for (type in annotation_types) {
    if (overwrite || !existing_types[[type]]) {
      df[[type]] <- data.frame(
        left = empty_char_list,
        match = empty_char_list,
        right = empty_char_list,
        stringsAsFactors = FALSE
      )
    }
  }

  if (overwrite || !existing_types$annotation_snippet) {
    df$annotation_snippet <- rep(NA_character_, nrows)
  }

  # Helper to decide if an existing annotation row is effectively empty
  is_empty_annotation_row <- function(ann_df, row_index) {
    if (is.null(ann_df) || nrow(ann_df) < row_index) {
      return(TRUE)
    }
    is_blank <- function(value) {
      is.null(value) || length(value) == 0 || all(is.na(value))
    }
    is_blank(ann_df$left[[row_index]]) &&
      is_blank(ann_df$match[[row_index]]) &&
      is_blank(ann_df$right[[row_index]])
  }

  # Helper: write values$left/match/right into one row of an annotation column
  # and return the modified data frame (list assignment keeps vectors intact)
  put_annotation_row <- function(target, type, row, values) {
    target[[type]]$left[row] <- list(values$left)
    target[[type]]$match[row] <- list(values$match)
    target[[type]]$right[row] <- list(values$right)
    target
  }

  # Helper: store the same placeholder in all annotation columns that were
  # created or are overwritten by this call; pre-existing data is preserved
  fill_placeholders <- function(target, row, placeholder) {
    blank <- list(left = placeholder, match = placeholder, right = placeholder)
    for (type in annotation_types) {
      if (overwrite || !existing_types[[type]]) {
        target <- put_annotation_row(target, type, row, blank)
      }
    }
    target
  }

  # Helper: store NAs for a failed request
  mark_failed <- function(target, row) {
    target <- fill_placeholders(target, row, NA)
    if (overwrite || !existing_types$annotation_snippet) {
      target$annotation_snippet[[row]] <- NA
    }
    target
  }

  # Helper: store parsed annotations, respecting the overwrite flag
  store_parsed <- function(target, row, parsed) {
    for (type in annotation_types) {
      if (overwrite || !existing_types[[type]] ||
        is_empty_annotation_row(target[[type]], row)) {
        target <- put_annotation_row(target, type, row, parsed[[type]])
      }
    }
    target
  }

  # Helper: build the annotation request URL for one match
  has_match_id <- "matchID" %in% colnames(df)
  annotation_request_url <- function(row) {
    match_path <- NULL
    if (has_match_id && !is.na(df$matchID[row])) {
      # matchID format: "match-match-A00/JUN/39609-p202-203" or encrypted format like
      # "match-DNB10/CSL/80400-p2343-2344x_MinDOhu_P6dd2MMZJyyus_7MairdKnr1LxY07Cya-Ow"
      # Look for pattern: match-(...)-p(\d+)-(\d+)(.*) where (.*) is the encrypted part
      if (regexpr("match-(.+?-p\\d+-\\d+.*)", df$matchID[row], perl = TRUE) > 0) {
        # Everything after "match-", including any encrypted suffix
        doc_path <- gsub("^match-(.+)$", "\\1", df$matchID[row], perl = TRUE)
        # Convert the dash before the position to a slash, keep the rest
        match_path <- gsub("-p(\\d+-\\d+.*)", "/p\\1", doc_path)
      }
    }
    if (is.null(match_path)) {
      # Fall back to textSigle and match positions; format the numbers to
      # avoid scientific notation
      match_start <- format(df$matchStart[row], scientific = FALSE)
      match_end <- format(df$matchEnd[row], scientific = FALSE)
      match_path <- paste0(df$textSigle[row], "/", "p", match_start, "-", match_end)
    }
    # Use httr2 to construct the URL safely
    base_url <- paste0(kco@apiUrl, "corpus/", match_path)
    httr2::url_modify(base_url, query = list(foundry = foundry))
  }

  # Helper: fetch and parse the annotations of one match and return the
  # updated data frame. Errors propagate to the caller's handler.
  annotate_row <- function(target, row, req) {
    res <- apiCall(kco, req)
    if (is.null(res)) {
      return(mark_failed(target, row))
    }

    has_snippet <- is.list(res) && !is.null(res[["snippet"]])

    # Store the raw annotation snippet (respect overwrite flag)
    if (overwrite || !existing_types$annotation_snippet ||
      is.null(target$annotation_snippet[[row]]) ||
      is.na(target$annotation_snippet[[row]])) {
      target$annotation_snippet[[row]] <- if (has_snippet) res[["snippet"]] else NA
    }

    if (!has_snippet) {
      # No snippet available, store empty vectors
      return(fill_placeholders(target, row, character(0)))
    }

    parsed_annotations <- parse_xml_annotations_structured(res[["snippet"]])
    tryCatch(
      store_parsed(target, row, parsed_annotations),
      error = function(assign_error) {
        # Set empty character vectors if the parsed data cannot be assigned
        fill_placeholders(target, row, character(0))
      }
    )
  }

  # Initialize timing for ETA calculation
  start_time <- Sys.time()
  if (verbose) {
    log_info(verbose, paste("Starting to fetch annotations for", nrows, "matches\n"))
  }

  for (i in seq_len(nrows)) {
    # ETA logging
    if (verbose && i > 1) {
      eta_info <- calculate_eta(i, nrows, start_time)
      log_info(verbose, paste("Fetching annotations for match", i, "of", nrows, eta_info, "\n"))
    }

    req <- annotation_request_url(i)

    # The handler returns the updated data frame instead of assigning inside
    # its own frame, so the NA placeholders really end up in `df`.
    df <- tryCatch(
      annotate_row(df, i, req),
      error = function(e) {
        log_info(verbose, paste0(
          "Fetching annotations for match ", i, " failed: ",
          conditionMessage(e), "\n"
        ))
        mark_failed(df, i)
      }
    )
  }

  # Update the collectedMatches with annotation data
  kqo@collectedMatches <- df

  if (verbose) {
    elapsed_time <- Sys.time() - start_time
    log_info(verbose, paste("Finished fetching annotations for", nrows, "matches in", format_duration(as.numeric(elapsed_time, units = "secs")), "\n"))
  }

  return(kqo)
})
				1610
Marc Kupietz	ad8d2ed	2025-04-05 15:37:38 +0200	[diff] [blame]	1611	#' Query frequencies of search expressions in virtual corpora
Marc Kupietz	3f57528	2019-10-04 14:46:04 +0200	[diff] [blame]	1612	#'
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	1613	#' `frequencyQuery` combines [corpusQuery()], [corpusStats()] and
Marc Kupietz	ad8d2ed	2025-04-05 15:37:38 +0200	[diff] [blame]	1614	#' [ci()] to compute a tibble with the absolute and relative frequencies and
Marc Kupietz	3f57528	2019-10-04 14:46:04 +0200	[diff] [blame]	1615	#' confidence intervals of one or multiple search terms across one or multiple
				1616	#' virtual corpora.
				1617	#'
Marc Kupietz	a8c40f4	2025-06-24 15:49:52 +0200	[diff] [blame]	1618	#' @family frequency analysis
Marc Kupietz	3f57528	2019-10-04 14:46:04 +0200	[diff] [blame]	1619	#' @aliases frequencyQuery
Marc Kupietz	3f57528	2019-10-04 14:46:04 +0200	[diff] [blame]	1620	#' @examples
Marc Kupietz	6ae7605	2021-09-21 10:34:00 +0200	[diff] [blame]	1621	#' \dontrun{
				1622	#'
Marc Kupietz	ad8d2ed	2025-04-05 15:37:38 +0200	[diff] [blame]	1623	#' KorAPConnection(verbose = TRUE) \|>
Marc Kupietz	3f57528	2019-10-04 14:46:04 +0200	[diff] [blame]	1624	#' frequencyQuery(c("Mücke", "Schnake"), paste0("pubDate in ", 2000:2003))
Marc Kupietz	05b2277	2020-02-18 21:58:42 +0100	[diff] [blame]	1625	#' }
Marc Kupietz	3f57528	2019-10-04 14:46:04 +0200	[diff] [blame]	1626	#'
Marc Kupietz	ad8d2ed	2025-04-05 15:37:38 +0200	[diff] [blame]	1627	# @inheritParams corpusQuery
Marc Kupietz	617266d	2025-02-27 10:43:07 +0100	[diff] [blame]	1628	#' @param kco [KorAPConnection()] object (obtained e.g. from `KorAPConnection()`)
Marc Kupietz	ad8d2ed	2025-04-05 15:37:38 +0200	[diff] [blame]	1629	#' @param query corpus query string(s) (can be a vector). The query language depends on the `ql` parameter. Either `query` must be provided or `KorAPUrl`.
				1630	#' @param vc virtual corpus definition(s) (can be a vector)
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	1631	#' @param conf.level confidence level of the returned confidence interval (passed through [ci()] to [prop.test()]).
				1632	#' @param as.alternatives LOGICAL that specifies if the query terms should be treated as alternatives. If `as.alternatives` is TRUE, the sum over all query hits, instead of the respective vc token sizes is used as total for the calculation of relative frequencies.
Marc Kupietz	ad8d2ed	2025-04-05 15:37:38 +0200	[diff] [blame]	1633	#' @param ... further arguments passed to or from other methods (see [corpusQuery()]), most notably `expand`, a logical that decides if `query` and `vc` parameters are expanded to all of their combinations. It defaults to `TRUE`, if `query` and `vc` have different lengths, and to `FALSE` otherwise.
Marc Kupietz	3f57528	2019-10-04 14:46:04 +0200	[diff] [blame]	1634	#' @export
Marc Kupietz	ad8d2ed	2025-04-05 15:37:38 +0200	[diff] [blame]	1635	#'
				1636	#' @return A tibble, with each row containing the following result columns for query and vc combinations:
				1637	#' - query: the query string used for the frequency analysis.
				1638	#' - totalResults: absolute frequency of query matches in the vc.
				1639	#' - vc: virtual corpus used for the query.
				1640	#' - webUIRequestUrl: URL of the corresponding web UI request with respect to query and vc.
				1641	#' - total: total number of words in vc.
				1642	#' - f: relative frequency of query matches in the vc.
				1643	#' - conf.low: lower bound of the confidence interval for the relative frequency, given `conf.level`.
				1644	#' - conf.high: upper bound of the confidence interval for the relative frequency, given `conf.level`.
				1645
setMethod(
  "frequencyQuery", "KorAPConnection",
  function(kco, query, vc = "", conf.level = 0.95, as.alternatives = FALSE, ...) {
    # Decide first how the denominator column `total` is derived; the query
    # itself is only sent afterwards.
    add_total <- if (as.alternatives) {
      # Alternatives: within each vc, the summed hits of all query terms serve
      # as the total.
      function(hits) {
        hits |>
          group_by(vc) |>
          mutate(total = sum(totalResults))
      }
    } else {
      # Default: the total is the `tokens` value reported by corpusStats()
      # for the vc.
      function(hits) {
        hits |>
          mutate(total = corpusStats(kco, vc = vc, as.df = TRUE)$tokens)
      }
    }

    # Only metadata is requested; the frequency table is then extended by the
    # total and handed to ci() for relative frequencies and confidence bounds.
    corpusQuery(kco, query, vc, metadataOnly = TRUE, as.df = TRUE, ...) |>
      add_total() |>
      ci(conf.level = conf.level)
  }
)
Marc Kupietz	3f57528	2019-10-04 14:46:04 +0200	[diff] [blame]	1660
Marc Kupietz	38a9d68	2024-12-06 16:17:09 +0100	[diff] [blame]	1661	#' buildWebUIRequestUrlFromString
				1662	#'
				1663	#' @rdname KorAPQuery-class
				1664	#' @importFrom urltools url_encode
				1665	#' @export
				1666	buildWebUIRequestUrlFromString <- function(KorAPUrl,
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	1667	query,
				1668	vc = "",
				1669	ql = "poliqarp") {
Marc Kupietz	38a9d68	2024-12-06 16:17:09 +0100	[diff] [blame]	1670	if ("KorAPConnection" %in% class(KorAPUrl)) {
				1671	KorAPUrl <- KorAPUrl@KorAPUrl
				1672	}
				1673
				1674	request <-
				1675	paste0(
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	1676	"?q=",
Marc Kupietz	38a9d68	2024-12-06 16:17:09 +0100	[diff] [blame]	1677	urltools::url_encode(enc2utf8(as.character(query))),
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	1678	ifelse(vc != "",
				1679	paste0("&cq=", urltools::url_encode(enc2utf8(vc))),
				1680	""
				1681	),
				1682	"&ql=",
Marc Kupietz	38a9d68	2024-12-06 16:17:09 +0100	[diff] [blame]	1683	ql
				1684	)
				1685	paste0(KorAPUrl, request)
				1686	}
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1687
				1688	#' buildWebUIRequestUrl
				1689	#'
				1690	#' @rdname KorAPQuery-class
Marc Kupietz	f912959	2025-01-26 19:17:54 +0100	[diff] [blame]	1691	#' @importFrom httr2 url_parse
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1692	#' @export
				1693	buildWebUIRequestUrl <- function(kco,
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	1694	query = if (missing(KorAPUrl)) {
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1695	stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	1696	} else {
				1697	httr2::url_parse(KorAPUrl)$query$q
				1698	},
Marc Kupietz	f912959	2025-01-26 19:17:54 +0100	[diff] [blame]	1699	vc = if (missing(KorAPUrl)) "" else httr2::url_parse(KorAPUrl)$query$cq,
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1700	KorAPUrl,
Marc Kupietz	f912959	2025-01-26 19:17:54 +0100	[diff] [blame]	1701	ql = if (missing(KorAPUrl)) "poliqarp" else httr2::url_parse(KorAPUrl)$query$ql) {
Marc Kupietz	38a9d68	2024-12-06 16:17:09 +0100	[diff] [blame]	1702	buildWebUIRequestUrlFromString(kco@KorAPUrl, query, vc, ql)
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1703	}
				1704
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	1705	#' format()
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	1706	#' @rdname KorAPQuery-class
				1707	#' @param x KorAPQuery object
				1708	#' @param ... further arguments passed to or from other methods
Marc Kupietz	b73ca0f	2025-01-28 20:45:01 +0100	[diff] [blame]	1709	#' @importFrom urltools param_get url_decode
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	1710	#' @export
				1711	format.KorAPQuery <- function(x, ...) {
				1712	cat("<KorAPQuery>\n")
				1713	q <- x
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	1714	param <- urltools::param_get(q@request) \|> lapply(urltools::url_decode)
Marc Kupietz	b73ca0f	2025-01-28 20:45:01 +0100	[diff] [blame]	1715	cat(" Query: ", param$q, "\n")
				1716	if (!is.null(param$cq) && param$cq != "") {
				1717	cat(" Virtual corpus: ", param$cq, "\n")
				1718	}
				1719	if (!is.null(q@collectedMatches)) {
				1720	cat("==============================================================================================================", "\n")
				1721	print(summary(q@collectedMatches))
				1722	cat("==============================================================================================================", "\n")
				1723	}
				1724	cat(" Total results: ", q@totalResults, "\n")
				1725	cat(" Fetched results: ", q@nextStartIndex, "\n")
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1726	if (!is.null(q@collectedMatches) && "pos" %in% colnames(q@collectedMatches)) {
				1727	successful_annotations <- sum(!is.na(q@collectedMatches$annotation_snippet))
				1728	parsed_annotations <- sum(!is.na(q@collectedMatches$pos))
				1729	cat(" Annotations: ", successful_annotations, " of ", nrow(q@collectedMatches), " matches")
				1730	if (parsed_annotations > 0) {
				1731	cat(" (", parsed_annotations, " with parsed linguistic data)")
				1732	}
				1733	cat("\n")
Marc Kupietz	e52b295	2025-07-17 16:53:02 +0200	[diff] [blame]	1734	}
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	1735	}
				1736
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	1737	#' show()
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	1738	#'
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	1739	#' @rdname KorAPQuery-class
				1740	#' @param object KorAPQuery object
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	1741	#' @export
setMethod(
  f = "show",
  signature = "KorAPQuery",
  definition = function(object) {
    # Printing is delegated to format(), which writes the summary to the
    # console; the object itself is handed back invisibly.
    format(object)
    invisible(object)
  }
)