Blame - R/KorAPQuery.R - KorAP/RKorAPClient

blob: ef58c3dd18a31c5c430a3e88a911ff2fd64e9b30 [file] [log] [blame]

Marc Kupietz	a8c40f4	2025-06-24 15:49:52 +0200	[diff] [blame]	1	#' KorAPQuery class (internal)
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	2	#'
Marc Kupietz	a8c40f4	2025-06-24 15:49:52 +0200	[diff] [blame]	3	#' Internal class for query state management. Users work with `corpusQuery()`, `fetchAll()`, and `fetchNext()` instead.
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	4	#'
Marc Kupietz	a8c40f4	2025-06-24 15:49:52 +0200	[diff] [blame]	5	#' @keywords internal
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	6	#' @include KorAPConnection.R
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	7	#' @include logging.R
Marc Kupietz	f912959	2025-01-26 19:17:54 +0100	[diff] [blame]	8	#' @import httr2
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	9	#'
Marc Kupietz	a6e4ee6	2021-03-05 09:00:15 +0100	[diff] [blame]	10	#' @include RKorAPClient-package.R
Marc Kupietz	5bbc9db	2019-08-30 16:30:45 +0200	[diff] [blame]	11
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	12	#' @export
				13	KorAPQuery <- setClass("KorAPQuery", slots = c(
Marc Kupietz	b897218	2019-09-20 21:33:46 +0200	[diff] [blame]	14	"korapConnection",
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	15	"request",
				16	"vc",
				17	"totalResults",
				18	"nextStartIndex",
				19	"fields",
				20	"requestUrl",
				21	"webUIRequestUrl",
				22	"apiResponse",
				23	"collectedMatches",
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame^]	24	"hasMoreMatches"
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	25	))
Marc Kupietz	5bbc9db	2019-08-30 16:30:45 +0200	[diff] [blame]	26
Marc Kupietz	a8c40f4	2025-06-24 15:49:52 +0200	[diff] [blame]	27	#' Initialize KorAPQuery object
				28	#' @keywords internal
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	29	#' @param .Object …
Marc Kupietz	b897218	2019-09-20 21:33:46 +0200	[diff] [blame]	30	#' @param korapConnection KorAPConnection object
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	31	#' @param request query part of the request URL
				32	#' @param vc definition of a virtual corpus
				33	#' @param totalResults number of hits the query has yielded
				34	#' @param nextStartIndex at what index to start the next fetch of query results
				35	#' @param fields what data / metadata fields should be collected
				36	#' @param requestUrl complete URL of the API request
				37	#' @param webUIRequestUrl URL of a web frontend request corresponding to the API request
				38	#' @param apiResponse data-frame representation of the JSON response of the API request
Marc Kupietz	7776dec	2019-09-27 16:59:02 +0200	[diff] [blame]	39	#' @param hasMoreMatches logical that signals if more query results can be fetched
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	40	#' @param collectedMatches matches already fetched from the KorAP-API-server
Marc Kupietz	97a1bca	2019-10-04 22:52:09 +0200	[diff] [blame]	41	#'
				42	#' @importFrom tibble tibble
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	43	#' @export
setMethod(
  "initialize", "KorAPQuery",
  function(.Object,
           korapConnection = NULL,
           request = NULL,
           vc = "",
           totalResults = 0,
           nextStartIndex = 0,
           fields = c(
             "corpusSigle", "textSigle", "pubDate", "pubPlace",
             "availability", "textClass", "snippet", "tokens"
           ),
           requestUrl = "",
           webUIRequestUrl = "",
           apiResponse = NULL,
           hasMoreMatches = FALSE,
           collectedMatches = NULL) {
    # Let the default initializer run first, then store every argument
    # (supplied or defaulted) in the slot of the same name.
    .Object <- callNextMethod()

    slot_values <- list(
      korapConnection = korapConnection,
      request = request,
      vc = vc,
      totalResults = totalResults,
      nextStartIndex = nextStartIndex,
      fields = fields,
      requestUrl = requestUrl,
      webUIRequestUrl = webUIRequestUrl,
      apiResponse = apiResponse,
      hasMoreMatches = hasMoreMatches,
      collectedMatches = collectedMatches
    )
    # `list()` keeps NULL entries, so NULL defaults are written to their
    # slots exactly like any other value.
    for (slot_name in names(slot_values)) {
      slot(.Object, slot_name) <- slot_values[[slot_name]]
    }

    .Object
  }
)
Marc Kupietz	632cbd4	2019-09-06 16:04:51 +0200	[diff] [blame]	66
# S4 generics of the query API. Each body must remain a bare call to
# standardGeneric() so that the generic is created as a standard generic.
setGeneric(
  name = "corpusQuery",
  def = function(kco, ...) standardGeneric("corpusQuery")
)
setGeneric(
  name = "fetchAll",
  def = function(kqo, ...) standardGeneric("fetchAll")
)
setGeneric(
  name = "fetchNext",
  def = function(kqo, ...) standardGeneric("fetchNext")
)
setGeneric(
  name = "fetchRest",
  def = function(kqo, ...) standardGeneric("fetchRest")
)
setGeneric(
  name = "fetchAnnotations",
  def = function(kqo, ...) standardGeneric("fetchAnnotations")
)
setGeneric(
  name = "frequencyQuery",
  def = function(kco, ...) standardGeneric("frequencyQuery")
)

# Page size used when fetching results; fetchNext() uses it as its default
# `maxFetch` and to convert between page numbers and result offsets.
maxResultsPerPage <- 50

## quiets concerns of R CMD check re: the .'s that appear in pipelines
utils::globalVariables(".")
Marc Kupietz	632cbd4	2019-09-06 16:04:51 +0200	[diff] [blame]	78
Marc Kupietz	a8c40f4	2025-06-24 15:49:52 +0200	[diff] [blame]	79	#' Search corpus for query terms
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	80	#'
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	81	#' `corpusQuery` performs a corpus query via a connection to a KorAP-API-server
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	82	#'
Marc Kupietz	a8c40f4	2025-06-24 15:49:52 +0200	[diff] [blame]	83	#' @family corpus search functions
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	84	#' @aliases corpusQuery
				85	#'
				86	#' @importFrom urltools url_encode
				87	#' @importFrom purrr pmap
Marc Kupietz	ea34b81	2025-06-25 15:49:00 +0200	[diff] [blame]	88	#' @importFrom dplyr bind_rows group_by
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	89	#'
Marc Kupietz	617266d	2025-02-27 10:43:07 +0100	[diff] [blame]	90	#' @param kco [KorAPConnection()] object (obtained e.g. from `KorAPConnection()`)
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	91	#' @param query string that contains the corpus query. The query language depends on the `ql` parameter. Either `query` must be provided or `KorAPUrl`.
Marc Kupietz	632cbd4	2019-09-06 16:04:51 +0200	[diff] [blame]	92	#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	93	#' @param KorAPUrl instead of providing the query and vc string parameters, you can also simply copy a KorAP query URL from your browser and use it here (and in `KorAPConnection`) to provide all necessary information for the query.
Marc Kupietz	132f005	2023-04-16 14:23:05 +0200	[diff] [blame]	94	#' @param metadataOnly logical that determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. Note that the default value is TRUE.
				95	#' If you want your corpus queries to return not only metadata, but also KWICS, you need to authorize
				96	#' your RKorAPClient application as explained in the
				97	#' [authorization section](https://github.com/KorAP/RKorAPClient#authorization)
				98	#' of the RKorAPClient Readme on GitHub and set the `metadataOnly` parameter to
				99	#' `FALSE`.
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	100	#' @param ql string to choose the query language (see [section on Query Parameters](https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET#user-content-parameters) in the Kustvakt-Wiki for possible values).
Marc Kupietz	1623fe8	2025-06-24 16:31:46 +0200	[diff] [blame]	101	#' @param fields character vector specifying which metadata fields to retrieve for each match.
				102	#' Available fields depend on the corpus. For DeReKo (German Reference Corpus), possible fields include:
				103	#' \describe{
				104	#' \item{Text identification:}{`textSigle`, `docSigle`, `corpusSigle` - hierarchical text identifiers}
				105	#' \item{Publication info:}{`author`, `editor`, `title`, `docTitle`, `corpusTitle` - authorship and titles}
				106	#' \item{Temporal data:}{`pubDate`, `creationDate` - when text was published/created}
				107	#' \item{Publication details:}{`pubPlace`, `publisher`, `reference` - where/how published}
				108	#' \item{Text classification:}{`textClass`, `textType`, `textTypeArt`, `textDomain`, `textColumn` - topic domain, genre, text type and column}
				109	#' \item{Administrative and technical info:}{`corpusEditor`, `availability`, `language`, `foundries` - access rights and annotations}
				110	#' \item{Content data:}{`snippet`, `tokens`, `tokenSource`, `externalLink` - actual text content, tokenization, and link to source text}
				111	#' \item{System data:}{`indexCreationDate`, `indexLastModified` - corpus indexing info}
				112	#' }
				113	#' Use `c("textSigle", "pubDate", "author")` to retrieve multiple fields.
				114	#' Default fields provide basic text identification and publication metadata. The actual text content (`snippet` and `tokens`) are activated by default if `metadataOnly` is set to `FALSE`.
Marc Kupietz	43a6ade	2020-02-18 17:01:44 +0100	[diff] [blame]	115	#' @param accessRewriteFatal abort if query or given vc had to be rewritten due to insufficient rights (not yet implemented).
Marc Kupietz	25aebc3	2019-09-16 18:40:50 +0200	[diff] [blame]	116	#' @param verbose print some info
Marc Kupietz	4de53ec	2019-10-04 09:12:00 +0200	[diff] [blame]	117	#' @param as.df return result as data frame instead of as S4 object?
Marc Kupietz	ad8d2ed	2025-04-05 15:37:38 +0200	[diff] [blame]	118	#' @param expand logical that decides if `query` and `vc` parameters are expanded to all of their combinations. Defaults to `TRUE`, iff `query` and `vc` have different lengths
Marc Kupietz	d9b2fd7	2023-04-17 19:08:50 +0200	[diff] [blame]	119	#' @param context string that specifies the size of the left and the right context returned in `snippet`
				120	#' (provided that `metadataOnly` is set to `FALSE` and that the necessary access rights are met).
				121	#' The format of the context size specification (e.g. `3-token,3-token`) is described in the [Service: Search GET documentation of the Kustvakt Wiki](https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET).
				122	#' If the parameter is not set, the default context size specification of the KorAP server instance will be used.
				123	#' Note that you cannot overrule the maximum context size set in the KorAP server instance,
				124	#' as this is typically legally motivated.
Marc Kupietz	ad8d2ed	2025-04-05 15:37:38 +0200	[diff] [blame]	125	#' @return Depending on the `as.df` parameter, a tibble or a [KorAPQuery()] object that, among other information, contains the total number of results in `@totalResults`. The resulting object can be used to fetch all query results (with [fetchAll()]) or the next page of results (with [fetchNext()]).
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	126	#' A corresponding URL to be used within a web browser is contained in `@webUIRequestUrl`.
				127	#' Please make sure to check `$collection$rewrites` to see if any unforeseen access rewrites of the query's virtual corpus had to be performed.
Marc Kupietz	632cbd4	2019-09-06 16:04:51 +0200	[diff] [blame]	128	#'
				129	#' @examples
Marc Kupietz	6ae7605	2021-09-21 10:34:00 +0200	[diff] [blame]	130	#' \dontrun{
				131	#'
Marc Kupietz	1623fe8	2025-06-24 16:31:46 +0200	[diff] [blame]	132	#' # Fetch basic metadata for "Ameisenplage"
Marc Kupietz	d352642	2025-06-25 09:16:15 +0200	[diff] [blame]	133	#' KorAPConnection() \|>
				134	#' corpusQuery("Ameisenplage") \|>
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	135	#' fetchAll()
Marc Kupietz	1623fe8	2025-06-24 16:31:46 +0200	[diff] [blame]	136	#'
				137	#' # Fetch specific metadata fields for bibliographic analysis
Marc Kupietz	d352642	2025-06-25 09:16:15 +0200	[diff] [blame]	138	#' query <- KorAPConnection() \|>
Marc Kupietz	1623fe8	2025-06-24 16:31:46 +0200	[diff] [blame]	139	#' corpusQuery("Ameisenplage",
				140	#' fields = c("textSigle", "author", "title", "pubDate", "pubPlace", "textType"))
				141	#' results <- fetchAll(query)
				142	#' results@collectedMatches
Marc Kupietz	657d8e7	2020-02-25 18:31:50 +0100	[diff] [blame]	143	#' }
Marc Kupietz	3c531f6	2019-09-13 12:17:24 +0200	[diff] [blame]	144	#'
Marc Kupietz	6ae7605	2021-09-21 10:34:00 +0200	[diff] [blame]	145	#' \dontrun{
				146	#'
Marc Kupietz	603491f	2019-09-18 14:01:02 +0200	[diff] [blame]	147	#' # Use the copy of a KorAP-web-frontend URL for an API query of "Ameise" in a virtual corpus
				148	#' # and show the number of query hits (but don't fetch them).
Marc Kupietz	69cc54a	2019-09-30 12:06:54 +0200	[diff] [blame]	149	#'
Marc Kupietz	d352642	2025-06-25 09:16:15 +0200	[diff] [blame]	150	#' KorAPConnection(verbose = TRUE) \|>
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	151	#' corpusQuery(
				152	#' KorAPUrl =
				153	#' "https://korap.ids-mannheim.de/?q=Ameise&cq=pubDate+since+2017&ql=poliqarp"
				154	#' )
Marc Kupietz	6ae7605	2021-09-21 10:34:00 +0200	[diff] [blame]	155	#' }
				156	#'
				157	#' \dontrun{
Marc Kupietz	3c531f6	2019-09-13 12:17:24 +0200	[diff] [blame]	158	#'
Marc Kupietz	603491f	2019-09-18 14:01:02 +0200	[diff] [blame]	159	#' # Plot the time/frequency curve of "Ameisenplage"
Marc Kupietz	d352642	2025-06-25 09:16:15 +0200	[diff] [blame]	160	#' KorAPConnection(verbose = TRUE) \|>
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	161	#' {
				162	#' . ->> kco
Marc Kupietz	d352642	2025-06-25 09:16:15 +0200	[diff] [blame]	163	#' } \|>
				164	#' corpusQuery("Ameisenplage") \|>
				165	#' fetchAll() \|>
				166	#' slot("collectedMatches") \|>
				167	#' mutate(year = lubridate::year(pubDate)) \|>
				168	#' dplyr::select(year) \|>
				169	#' group_by(year) \|>
				170	#' summarise(Count = dplyr::n()) \|>
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	171	#' mutate(Freq = mapply(function(f, y) {
				172	#' f / corpusStats(kco, paste("pubDate in", y))@tokens
Marc Kupietz	d352642	2025-06-25 09:16:15 +0200	[diff] [blame]	173	#' }, Count, year)) \|>
				174	#' dplyr::select(-Count) \|>
				175	#' complete(year = min(year):max(year), fill = list(Freq = 0)) \|>
Marc Kupietz	69cc54a	2019-09-30 12:06:54 +0200	[diff] [blame]	176	#' plot(type = "l")
Marc Kupietz	05b2277	2020-02-18 21:58:42 +0100	[diff] [blame]	177	#' }
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	178	#' @seealso [KorAPConnection()], [fetchNext()], [fetchRest()], [fetchAll()], [corpusStats()]
Marc Kupietz	632cbd4	2019-09-06 16:04:51 +0200	[diff] [blame]	179	#'
				180	#' @references
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	181	#' <https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026>
Marc Kupietz	632cbd4	2019-09-06 16:04:51 +0200	[diff] [blame]	182	#'
				183	#' @export
setMethod(
  "corpusQuery", "KorAPConnection",
  function(kco,
           query = if (missing(KorAPUrl)) {
             stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
           } else {
             httr2::url_parse(KorAPUrl)$query$q
           },
           vc = if (missing(KorAPUrl)) "" else httr2::url_parse(KorAPUrl)$query$cq,
           KorAPUrl,
           metadataOnly = TRUE,
           ql = if (missing(KorAPUrl)) "poliqarp" else httr2::url_parse(KorAPUrl)$query$ql,
           fields = c(
             "corpusSigle",
             "textSigle",
             "pubDate",
             "pubPlace",
             "availability",
             "textClass",
             "snippet",
             "tokens"
           ),
           accessRewriteFatal = TRUE,
           verbose = kco@verbose,
           expand = length(vc) != length(query),
           as.df = FALSE,
           context = NULL) {
    # A pasted KorAPUrl does not have to carry `cq` or `ql`; url_parse() then
    # yields NULL for them. Fall back to the same values that are used when
    # no KorAPUrl is given, so that `vc` is never NULL further down (NULL
    # would be dropped from the result data frame) and `&ql=` is never empty.
    if (is.null(vc)) vc <- ""
    if (is.null(ql)) ql <- "poliqarp"

    if (length(query) > 1 || length(vc) > 1) {
      # Several queries and/or virtual corpora: only count the hits of each
      # combination and return one data frame row per combination.
      grid <- if (expand) expand_grid(query = query, vc = vc) else tibble(query = query, vc = vc)

      # Timing information for the ETA display
      total_queries <- nrow(grid)
      start_time <- Sys.time()

      results <- purrr::pmap(
        list(query = grid$query, vc = grid$vc, index = seq_len(total_queries)),
        function(query, vc, index) {
          req <- build_search_request(kco, query, vc, ql, fields, metadataOnly, context)

          # Show individual query progress
          log_info(verbose, "\rSearching \"", query, "\" in \"", vc, "\"", sep = "")
          # count=0: only the hit count is needed here, no matches
          res <- apiCall(kco, paste0(req$requestUrl, "&count=0"))
          if (is.null(res)) {
            log_info(verbose, ": API call failed\n")
            totalResults <- 0
          } else {
            totalResults <- as.integer(res$meta$totalResults)
            log_info(verbose, ": ", totalResults, " hits")
            if (!is.null(res$meta$cached)) {
              log_info(verbose, " [cached]")
            } else if (!is.null(res$meta$benchmark)) {
              log_info(verbose, ", took ", format_benchmark_time(res$meta$benchmark))
            }

            # Append ETA information to the same output line
            if (verbose && total_queries > 1) {
              eta_info <- calculate_eta(index, total_queries, start_time)
              if (eta_info != "") {
                log_info(verbose, eta_info)
              }
            }

            log_info(verbose, "\n")
          }

          data.frame(
            query = query,
            totalResults = totalResults,
            vc = vc,
            webUIRequestUrl = req$webUIRequestUrl,
            stringsAsFactors = FALSE
          )
        }
      )

      bind_rows(results)
    } else {
      req <- build_search_request(kco, query, vc, ql, fields, metadataOnly, context)

      log_info(verbose, "\rSearching \"", query, "\" in \"", vc, "\"", sep = "")
      # count=0: only the hit count is needed here; matches are fetched later
      res <- apiCall(kco, paste0(req$requestUrl, "&count=0"))
      if (is.null(res)) {
        message("API call failed.")
        totalResults <- 0
      } else {
        totalResults <- as.integer(res$meta$totalResults)
        log_info(verbose, ": ", totalResults, " hits")
        if (!is.null(res$meta$cached)) {
          log_info(verbose, " [cached]\n")
        } else if (!is.null(res$meta$benchmark)) {
          log_info(verbose, ", took ", format_benchmark_time(res$meta$benchmark), "\n", sep = "")
        } else {
          log_info(verbose, "\n")
        }
      }

      if (as.df) {
        data.frame(
          query = query,
          totalResults = totalResults,
          vc = vc,
          webUIRequestUrl = req$webUIRequestUrl,
          stringsAsFactors = FALSE
        )
      } else {
        KorAPQuery(
          korapConnection = kco,
          nextStartIndex = 0,
          fields = req$fields,
          requestUrl = req$requestUrl,
          request = req$request,
          totalResults = totalResults,
          vc = vc,
          apiResponse = res,
          webUIRequestUrl = req$webUIRequestUrl,
          hasMoreMatches = (totalResults > 0)
        )
      }
    }
  }
)

# Internal helper of corpusQuery(): assembles the request for one query / vc
# combination.
#
# Returns a list with
#   fields          - the effective field vector (content fields `snippet` and
#                     `tokens` removed if `metadataOnly`; `textSigle` always
#                     included),
#   request         - the query string starting with "?q=",
#   webUIRequestUrl - `request` appended to the connection's `KorAPUrl`,
#   requestUrl      - the API search URL including the `fields` parameter.
build_search_request <- function(kco, query, vc, ql, fields, metadataOnly, context) {
  content_fields <- c("snippet", "tokens")
  if (metadataOnly) {
    fields <- fields[!fields %in% content_fields]
  }
  if (!"textSigle" %in% fields) {
    fields <- c(fields, "textSigle")
  }

  # Plain scalar conditions instead of ifelse(): an NA or zero-length value
  # simply means "parameter not set" rather than ending up as "NA" in the URL.
  has_context <- !metadataOnly && !is.null(context) && !is.na(context) && context != ""
  has_vc <- length(vc) == 1 && !is.na(vc) && vc != ""

  request <- paste0(
    "?q=",
    url_encode(enc2utf8(query)),
    if (has_context) paste0("&context=", url_encode(enc2utf8(context))) else "",
    if (has_vc) paste0("&cq=", url_encode(enc2utf8(vc))) else "",
    if (!metadataOnly) "&show-tokens=true" else "",
    "&ql=", ql
  )

  list(
    fields = fields,
    request = request,
    webUIRequestUrl = paste0(kco@KorAPUrl, request),
    requestUrl = paste0(
      kco@apiUrl,
      "search",
      request,
      "&fields=",
      paste(fields, collapse = ","),
      if (metadataOnly) "&access-rewrite-disabled=true" else ""
    )
  )
}

# Internal helper of corpusQuery(): rounds a benchmark string such as
# "0.123456s" to two decimal places ("0.12s") for display. Values in any other
# format are returned unchanged.
format_benchmark_time <- function(benchmark) {
  if (is.character(benchmark) && grepl("s$", benchmark)) {
    seconds <- as.numeric(sub("s$", "", benchmark))
    paste0(round(seconds, 2), "s")
  } else {
    benchmark
  }
}
Marc Kupietz	5bbc9db	2019-08-30 16:30:45 +0200	[diff] [blame]	378
Marc Kupietz	05a6079	2024-12-07 16:23:31 +0100	[diff] [blame]	379	#' @importFrom purrr map
				380	repair_data_strcuture <- function(x) {
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	381	if (is.list(x)) {
				382	as.character(purrr::map(x, ~ if (length(.x) > 1) {
Marc Kupietz	05a6079	2024-12-07 16:23:31 +0100	[diff] [blame]	383	paste(.x, collapse = " ")
				384	} else {
				385	.x
				386	}))
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	387	} else {
Marc Kupietz	05a6079	2024-12-07 16:23:31 +0100	[diff] [blame]	388	ifelse(is.na(x), "", x)
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	389	}
Marc Kupietz	05a6079	2024-12-07 16:23:31 +0100	[diff] [blame]	390	}
				391
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	392	#' Fetch the next bunch of results of a KorAP query.
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	393	#'
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	394	#' `fetchNext` fetches the next bunch of results of a KorAP query.
Marc Kupietz	3f57528	2019-10-04 14:46:04 +0200	[diff] [blame]	395	#'
Marc Kupietz	a8c40f4	2025-06-24 15:49:52 +0200	[diff] [blame]	396	#' @family corpus search functions
				397	#'
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	398	#' @param kqo object obtained from [corpusQuery()]
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	399	#' @param offset start offset for query results to fetch
				400	#' @param maxFetch maximum number of query results to fetch
Marc Kupietz	25aebc3	2019-09-16 18:40:50 +0200	[diff] [blame]	401	#' @param verbose print progress information if true
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	402	#' @param randomizePageOrder fetch result pages in pseudo random order if true. Use [set.seed()] to set seed for reproducible results.
				403	#' @return The `kqo` input object with updated slots `collectedMatches`, `apiResponse`, `nextStartIndex`, `hasMoreMatches`
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	404	#'
Marc Kupietz	05b2277	2020-02-18 21:58:42 +0100	[diff] [blame]	405	#' @examples
Marc Kupietz	6ae7605	2021-09-21 10:34:00 +0200	[diff] [blame]	406	#' \dontrun{
				407	#'
Marc Kupietz	d352642	2025-06-25 09:16:15 +0200	[diff] [blame]	408	#' q <- KorAPConnection() \|>
				409	#' corpusQuery("Ameisenplage") \|>
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	410	#' fetchNext()
Marc Kupietz	05b2277	2020-02-18 21:58:42 +0100	[diff] [blame]	411	#' q@collectedMatches
Marc Kupietz	657d8e7	2020-02-25 18:31:50 +0100	[diff] [blame]	412	#' }
Marc Kupietz	05b2277	2020-02-18 21:58:42 +0100	[diff] [blame]	413	#'
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	414	#' @references
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	415	#' <https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026>
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	416	#'
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	417	#' @aliases fetchNext
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	418	#' @importFrom dplyr rowwise mutate bind_rows select summarise n select
Marc Kupietz	f488112	2024-12-17 14:55:39 +0100	[diff] [blame]	419	#' @importFrom tibble enframe add_column
				420	#' @importFrom stringr word
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	421	#' @importFrom tidyr unnest unchop pivot_wider
				422	#' @importFrom purrr map
Marc Kupietz	632cbd4	2019-09-06 16:04:51 +0200	[diff] [blame]	423	#' @export
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	424	setMethod("fetchNext", "KorAPQuery", function(kqo,
				425	offset = kqo@nextStartIndex,
				426	maxFetch = maxResultsPerPage,
				427	verbose = kqo@korapConnection@verbose,
				428	randomizePageOrder = FALSE) {
Marc Kupietz	a7a8f1b	2024-12-18 15:56:19 +0100	[diff] [blame]	429	# https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	430	results <- key <- name <- tmp_positions <- 0
Marc Kupietz	a7a8f1b	2024-12-18 15:56:19 +0100	[diff] [blame]	431
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	432	if (kqo@totalResults == 0 \|\| offset >= kqo@totalResults) {
				433	return(kqo)
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	434	}
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	435	use_korap_api <- Sys.getenv("USE_KORAP_API", unset = NA)
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	436	# Calculate the initial page number (not used directly - keeping for reference)
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	437	collectedMatches <- kqo@collectedMatches
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	438
Marc Kupietz	24799fd	2025-06-25 14:15:36 +0200	[diff] [blame]	439	# Track start time for ETA calculation
				440	start_time <- Sys.time()
				441
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	442	# For randomized page order, generate a list of randomized page indices
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	443	if (randomizePageOrder) {
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	444	# Calculate how many pages we need to fetch based on maxFetch
				445	total_pages_to_fetch <- if (!is.na(maxFetch)) {
				446	# Either limited by maxFetch or total results, whichever is smaller
				447	min(ceiling(maxFetch / maxResultsPerPage), ceiling(kqo@totalResults / maxResultsPerPage))
				448	} else {
				449	# All pages
				450	ceiling(kqo@totalResults / maxResultsPerPage)
				451	}
				452
				453	# Generate randomized page indices (0-based for API)
				454	pages <- sample.int(ceiling(kqo@totalResults / maxResultsPerPage), total_pages_to_fetch) - 1
				455	page_index <- 1 # Index to track which page in the randomized list we're on
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	456	}
				457
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	458	if (is.null(collectedMatches)) {
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	459	collectedMatches <- data.frame()
				460	}
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	461
				462	# Initialize the page counter properly based on nextStartIndex and any previously fetched results
				463	# We add 1 to make it 1-based for display purposes since users expect page numbers to start from 1
				464	# For first call, this will be 1, for subsequent calls, it will reflect our actual position
				465	current_page_number <- ceiling(offset / maxResultsPerPage) + 1
				466
				467	# For sequential fetches, keep track of which global page we're on
				468	# This is important for correctly showing page numbers in subsequent fetchNext calls
				469	page_count_start <- current_page_number
				470
Marc Kupietz	5bbc9db	2019-08-30 16:30:45 +0200	[diff] [blame]	471	repeat {
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	472	# Determine which page to fetch next
				473	if (randomizePageOrder) {
				474	# In randomized mode, get the page from our randomized list using the page_index
				475	# Make sure we don't exceed the array bounds
				476	if (page_index > length(pages)) {
				477	break # No more pages to fetch in randomized mode
				478	}
				479	current_offset_page <- pages[page_index]
				480	# For display purposes in randomized mode, show which page out of the total we're fetching
				481	display_page_number <- page_index
				482	} else {
				483	# In sequential mode, use the current_page_number to calculate the offset
				484	current_offset_page <- (current_page_number - 1)
				485	display_page_number <- current_page_number
				486	}
				487
				488	# Calculate the actual offset in tokens
				489	currentOffset <- current_offset_page * maxResultsPerPage
				490
Marc Kupietz	ef0e939	2025-06-18 12:21:49 +0200	[diff] [blame]	491	# Build the query with the appropriate count and offset using httr2
				492	count_param <- min(if (!is.na(maxFetch)) maxFetch - results else maxResultsPerPage, maxResultsPerPage)
Marc Kupietz	ecc8670	2025-06-24 12:12:51 +0200	[diff] [blame]	493
Marc Kupietz	ef0e939	2025-06-18 12:21:49 +0200	[diff] [blame]	494	# Parse existing URL to preserve all query parameters
				495	parsed_url <- httr2::url_parse(kqo@requestUrl)
				496	existing_query <- parsed_url$query
Marc Kupietz	ecc8670	2025-06-24 12:12:51 +0200	[diff] [blame]	497
Marc Kupietz	ef0e939	2025-06-18 12:21:49 +0200	[diff] [blame]	498	# Add/update count and offset parameters
				499	existing_query$count <- count_param
				500	existing_query$offset <- currentOffset
Marc Kupietz	ecc8670	2025-06-24 12:12:51 +0200	[diff] [blame]	501
Marc Kupietz	ef0e939	2025-06-18 12:21:49 +0200	[diff] [blame]	502	# Rebuild the URL with all parameters
				503	query <- httr2::url_modify(kqo@requestUrl, query = existing_query)
Marc Kupietz	6817095	2021-06-30 09:37:21 +0200	[diff] [blame]	504	res <- apiCall(kqo@korapConnection, query)
				505	if (length(res$matches) == 0) {
				506	break
				507	}
				508
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	509	if ("fields" %in% colnames(res$matches) && (is.na(use_korap_api) \|\| as.numeric(use_korap_api) >= 1.0)) {
Marc Kupietz	16ccf11	2025-01-26 13:25:27 +0100	[diff] [blame]	510	log_info(verbose, "Using fields API: ")
Marc Kupietz	05a6079	2024-12-07 16:23:31 +0100	[diff] [blame]	511	currentMatches <- res$matches$fields %>%
				512	purrr::map(~ mutate(.x, value = repair_data_strcuture(value))) %>%
				513	tibble::enframe() %>%
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	514	tidyr::unnest(cols = value) %>%
				515	tidyr::pivot_wider(names_from = key, id_cols = name, names_repair = "unique") %>%
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	516	dplyr::select(-name)
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	517	if ("snippet" %in% colnames(res$matches)) {
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	518	currentMatches$snippet <- res$matches$snippet
				519	}
Marc Kupietz	3cd2c6c	2025-01-08 20:35:39 +0100	[diff] [blame]	520	if ("tokens" %in% colnames(res$matches)) {
				521	currentMatches$tokens <- res$matches$tokens
				522	}
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	523	} else {
				524	currentMatches <- res$matches
				525	}
				526
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	527	for (field in kqo@fields) {
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	528	if (!field %in% colnames(currentMatches)) {
				529	currentMatches[, field] <- NA
Marc Kupietz	5bbc9db	2019-08-30 16:30:45 +0200	[diff] [blame]	530	}
				531	}
Marc Kupietz	f488112	2024-12-17 14:55:39 +0100	[diff] [blame]	532	currentMatches <- currentMatches %>%
				533	select(kqo@fields) %>%
				534	mutate(
Marc Kupietz	ff712a9	2025-07-18 09:07:23 +0200	[diff] [blame]	535	matchID = res$matches$matchID,
Marc Kupietz	0447da0	2025-01-08 20:51:09 +0100	[diff] [blame]	536	tmp_positions = gsub(".-p(\\d+)-(\\d+).", "\\1 \\2", res$matches$matchID),
Marc Kupietz	f488112	2024-12-17 14:55:39 +0100	[diff] [blame]	537	matchStart = as.integer(stringr::word(tmp_positions, 1)),
				538	matchEnd = as.integer(stringr::word(tmp_positions, 2)) - 1
				539	) %>%
				540	select(-tmp_positions)
				541
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	542	if (!is.list(collectedMatches)) {
				543	collectedMatches <- currentMatches
Marc Kupietz	5bbc9db	2019-08-30 16:30:45 +0200	[diff] [blame]	544	} else {
Marc Kupietz	2078bde	2023-08-27 16:46:15 +0200	[diff] [blame]	545	collectedMatches <- bind_rows(collectedMatches, currentMatches)
Marc Kupietz	5bbc9db	2019-08-30 16:30:45 +0200	[diff] [blame]	546	}
Marc Kupietz	ae9b617	2025-05-02 15:50:01 +0200	[diff] [blame]	547
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	548	# Get the actual items per page from the API response
				549	# We now consistently use maxResultsPerPage instead
Marc Kupietz	acbaab0	2025-05-01 10:56:35 +0200	[diff] [blame]	550
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	551	# Calculate total pages consistently using fixed maxResultsPerPage
				552	# This ensures consistent page counting across the function
				553	total_pages <- ceiling(kqo@totalResults / maxResultsPerPage)
				554
Marc Kupietz	24799fd	2025-06-25 14:15:36 +0200	[diff] [blame]	555	# Calculate ETA using the centralized function from logging.R
				556	current_page <- if (randomizePageOrder) page_index else display_page_number
				557	total_pages_to_fetch <- if (!is.na(maxFetch)) {
				558	# Account for offset - we can only fetch from the remaining results after offset
				559	remaining_results_after_offset <- max(0, kqo@totalResults - offset)
				560	min(ceiling(maxFetch / maxResultsPerPage), ceiling(remaining_results_after_offset / maxResultsPerPage))
				561	} else {
				562	total_pages
				563	}
Marc Kupietz	365660e	2025-06-25 15:09:55 +0200	[diff] [blame]	564
Marc Kupietz	24799fd	2025-06-25 14:15:36 +0200	[diff] [blame]	565	eta_info <- calculate_eta(current_page, total_pages_to_fetch, start_time)
Marc Kupietz	365660e	2025-06-25 15:09:55 +0200	[diff] [blame]	566
Marc Kupietz	24799fd	2025-06-25 14:15:36 +0200	[diff] [blame]	567	# Extract timing information for display
Marc Kupietz	ae9b617	2025-05-02 15:50:01 +0200	[diff] [blame]	568	time_per_page <- NA
Marc Kupietz	ae9b617	2025-05-02 15:50:01 +0200	[diff] [blame]	569	if (!is.null(res$meta$benchmark) && is.character(res$meta$benchmark)) {
Marc Kupietz	ae9b617	2025-05-02 15:50:01 +0200	[diff] [blame]	570	time_per_page <- suppressWarnings(as.numeric(sub("s", "", res$meta$benchmark)))
Marc Kupietz	acbaab0	2025-05-01 10:56:35 +0200	[diff] [blame]	571	}
				572
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	573	# Create the page display string with proper formatting
Marc Kupietz	acbaab0	2025-05-01 10:56:35 +0200	[diff] [blame]	574
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	575	# For global page tracking, calculate the absolute page number
				576	actual_display_number <- if (randomizePageOrder) {
				577	current_offset_page + 1 # In randomized mode, this is the actual page (0-based + 1)
				578	} else {
				579	# In sequential mode, the absolute page number is the actual offset page + 1 (to make it 1-based)
				580	current_offset_page + 1
				581	}
				582
				583	# For subsequent calls to fetchNext, we need to calculate the correct page numbers
				584	# based on the current batch being fetched
				585
				586	# For each call to fetchNext, we want to show 1/2, 2/2 (not 3/4, 4/4)
				587	# Simply count from 1 within the current batch
				588
				589	# The relative page number is simply the current position in this batch
				590	if (randomizePageOrder) {
				591	relative_page_number <- page_index # In randomized mode, we start from 1 in each batch
				592	} else {
				593	relative_page_number <- display_page_number - (page_count_start - 1)
				594	}
				595
				596	# How many pages will we fetch in this batch?
Marc Kupietz	021663d	2025-06-18 17:49:22 +0200	[diff] [blame]	597	# If maxFetch is specified, calculate the total pages for this fetch operation
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	598	pages_in_this_batch <- if (!is.na(maxFetch)) {
Marc Kupietz	021663d	2025-06-18 17:49:22 +0200	[diff] [blame]	599	# Account for offset - we can only fetch from the remaining results after offset
				600	remaining_results_after_offset <- max(0, kqo@totalResults - offset)
				601	min(ceiling(maxFetch / maxResultsPerPage), ceiling(remaining_results_after_offset / maxResultsPerPage))
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	602	} else {
				603	# Otherwise fetch all remaining pages
				604	total_pages - page_count_start + 1
				605	}
				606
				607	# The total pages to be shown in this batch
				608	batch_total_pages <- pages_in_this_batch
				609
				610	page_display <- paste0(
				611	"Retrieved page ",
				612	sprintf(paste0("%", nchar(batch_total_pages), "d"), relative_page_number),
				613	"/",
				614	sprintf("%d", batch_total_pages)
				615	)
				616
				617	# If randomized, also show which actual page we fetched
				618	if (randomizePageOrder) {
				619	# Determine the maximum width needed for page numbers (based on total pages)
				620	# This ensures consistent alignment
				621	max_page_width <- nchar(as.character(total_pages))
				622	# Add the actual page number that was fetched (0-based + 1 for display) with proper padding
Marc Kupietz	7638ca4	2025-05-25 13:18:16 +0200	[diff] [blame]	623	page_display <- paste0(
				624	page_display,
				625	sprintf(" (actual page %*d)", max_page_width, current_offset_page + 1)
				626	)
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	627	}
				628	# Always show the absolute page number and total pages (for clarity)
				629	else {
				630	# Show the absolute page number (out of total possible pages)
				631	page_display <- paste0(page_display, sprintf(
				632	" (page %d of %d total)",
				633	actual_display_number, total_pages
				634	))
				635	}
				636
				637	# Add caching or timing information
				638	if (!is.null(res$meta$cached)) {
				639	page_display <- paste0(page_display, " [cached]")
				640	} else {
				641	page_display <- paste0(
				642	page_display,
				643	" in ",
				644	if (!is.na(time_per_page)) sprintf("%4.1f", time_per_page) else "?",
Marc Kupietz	24799fd	2025-06-25 14:15:36 +0200	[diff] [blame]	645	"s",
				646	eta_info
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	647	)
				648	}
				649
				650	log_info(verbose, paste0(page_display, "\n"))
				651
				652	# Increment the appropriate counter based on mode
				653	if (randomizePageOrder) {
				654	page_index <- page_index + 1
				655	} else {
				656	current_page_number <- current_page_number + 1
				657	}
Marc Kupietz	5bbc9db	2019-08-30 16:30:45 +0200	[diff] [blame]	658	results <- results + res$meta$itemsPerPage
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	659	if (nrow(collectedMatches) >= kqo@totalResults \|\| (!is.na(maxFetch) && results >= maxFetch)) {
Marc Kupietz	5bbc9db	2019-08-30 16:30:45 +0200	[diff] [blame]	660	break
				661	}
				662	}
Marc Kupietz	6817095	2021-06-30 09:37:21 +0200	[diff] [blame]	663	nextStartIndex <- min(res$meta$startIndex + res$meta$itemsPerPage, kqo@totalResults)
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	664	KorAPQuery(
				665	nextStartIndex = nextStartIndex,
Marc Kupietz	d0d3e9b	2019-09-24 17:36:03 +0200	[diff] [blame]	666	korapConnection = kqo@korapConnection,
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	667	fields = kqo@fields,
				668	requestUrl = kqo@requestUrl,
				669	request = kqo@request,
Marc Kupietz	6817095	2021-06-30 09:37:21 +0200	[diff] [blame]	670	totalResults = kqo@totalResults,
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	671	vc = kqo@vc,
				672	webUIRequestUrl = kqo@webUIRequestUrl,
Marc Kupietz	6817095	2021-06-30 09:37:21 +0200	[diff] [blame]	673	hasMoreMatches = (kqo@totalResults > nextStartIndex),
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	674	apiResponse = res,
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	675	collectedMatches = collectedMatches
				676	)
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	677	})
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	678
				679	#' Fetch all results of a KorAP query.
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	680	#'
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	681	#' `fetchAll` fetches all results of a KorAP query.
Marc Kupietz	a6e4ee6	2021-03-05 09:00:15 +0100	[diff] [blame]	682	#'
Marc Kupietz	a8c40f4	2025-06-24 15:49:52 +0200	[diff] [blame]	683	#' @family corpus search functions
Marc Kupietz	dc880ac	2025-06-24 20:34:43 +0200	[diff] [blame]	684	#' @param kqo object obtained from [corpusQuery()]
				685	#' @param verbose print progress information if true
				686	#' @param ... further arguments passed to [fetchNext()]
				687	#' @return The updated `kqo` object with all results in `@collectedMatches`
Marc Kupietz	a8c40f4	2025-06-24 15:49:52 +0200	[diff] [blame]	688	#'
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	689	#' @examples
Marc Kupietz	6ae7605	2021-09-21 10:34:00 +0200	[diff] [blame]	690	#' \dontrun{
Marc Kupietz	ecc8670	2025-06-24 12:12:51 +0200	[diff] [blame]	691	#' # Fetch all metadata of every query hit for "Ameisenplage" and show a summary
				692	#' q <- KorAPConnection() \|>
				693	#' corpusQuery("Ameisenplage") \|>
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	694	#' fetchAll()
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	695	#' q@collectedMatches
Marc Kupietz	ecc8670	2025-06-24 12:12:51 +0200	[diff] [blame]	696	#'
				697	#' # Fetch also all KWICs
				698	#' q <- KorAPConnection() \|> auth() \|>
				699	#' corpusQuery("Ameisenplage", metadataOnly = FALSE) \|>
				700	#' fetchAll()
				701	#' q@collectedMatches
				702	#'
				703	#' # Retrieve title and text sigle metadata of all texts published on 1958-03-12
				704	#' q <- KorAPConnection() \|>
				705	#' corpusQuery("<base/s=t>", # this matches each text once
				706	#' vc = "pubDate in 1958-03-12",
				707	#' fields = c("textSigle", "title"),
				708	#' ) \|>
				709	#' fetchAll()
				710	#' q@collectedMatches
Marc Kupietz	05b2277	2020-02-18 21:58:42 +0100	[diff] [blame]	711	#' }
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	712	#'
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	713	#' @aliases fetchAll
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	714	#' @export
setMethod("fetchAll", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) {
  # Delegate to fetchNext(): start at offset 0 and pass maxFetch = NA;
  # any further arguments are forwarded unchanged.
  fetchNext(kqo, offset = 0, maxFetch = NA, verbose = verbose, ...)
})
				718
				719	#' Fetches the remaining results of a KorAP query.
				720	#'
Marc Kupietz	dc880ac	2025-06-24 20:34:43 +0200	[diff] [blame]	721	#' @param kqo object obtained from [corpusQuery()]
				722	#' @param verbose print progress information if true
				723	#' @param ... further arguments passed to [fetchNext()]
				724	#' @return The updated `kqo` object with remaining results in `@collectedMatches`
				725	#'
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	726	#' @examples
Marc Kupietz	6ae7605	2021-09-21 10:34:00 +0200	[diff] [blame]	727	#' \dontrun{
				728	#'
Marc Kupietz	d352642	2025-06-25 09:16:15 +0200	[diff] [blame]	729	#' q <- KorAPConnection() \|>
				730	#' corpusQuery("Ameisenplage") \|>
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	731	#' fetchRest()
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	732	#' q@collectedMatches
Marc Kupietz	05b2277	2020-02-18 21:58:42 +0100	[diff] [blame]	733	#' }
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	734	#'
				735	#' @aliases fetchRest
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	736	#' @export
setMethod("fetchRest", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) {
  # Delegate to fetchNext() with maxFetch = NA; the offset is left to
  # fetchNext()'s own default and further arguments are forwarded unchanged.
  fetchNext(kqo, maxFetch = NA, verbose = verbose, ...)
})
				740
#' Parse XML annotations into linguistic layers
#'
#' Internal helper function to extract linguistic annotations (lemma, POS, morphology)
#' from XML annotation snippets returned by the KorAP API.
#'
#' If the snippet contains a `<span class="match">` element, only the text following
#' its opening tag is parsed, up to (but excluding) `<span class="context-right">` or,
#' if there is none, up to the trailing `</span>`. Otherwise the whole snippet is parsed.
#'
#' Annotations are read from the `title` attributes of the `<span>` elements that wrap
#' a token: `tt/l:` (lemma), `tt/p:` (part of speech) and `tt/m:` (morphology). If a
#' layer occurs more than once for a token, the last value wins; layers that are not
#' present are `NA`.
#'
#' @param xml_snippet XML string containing annotation data
#' @return Named list with equally long character vectors 'token', 'lemma', 'pos', and 'morph'
#' @keywords internal
parse_xml_annotations <- function(xml_snippet) {
  empty_result <- list(
    token = character(0),
    lemma = character(0),
    pos = character(0),
    morph = character(0)
  )

  # The length check comes first so that zero-length input cannot reach `||`
  # with a zero-length condition (an error in current R versions).
  if (is.null(xml_snippet) || length(xml_snippet) == 0 ||
    is.na(xml_snippet) || xml_snippet == "") {
    return(empty_result)
  }

  # Restrict parsing to the content of <span class="match">...</span> if present.
  content_to_parse <- xml_snippet
  start_pos <- regexpr('<span class="match">', xml_snippet, fixed = TRUE)
  if (start_pos > 0) {
    content_start <- start_pos + attr(start_pos, "match.length")
    remaining <- substr(xml_snippet, content_start, nchar(xml_snippet))

    right_pos <- regexpr('<span class="context-right">', remaining, fixed = TRUE)
    content_to_parse <- if (right_pos > 0) {
      # Keep everything before the right context starts.
      substr(remaining, 1, right_pos - 1)
    } else {
      # No right context: drop the final closing </span> (and trailing whitespace).
      sub("</span>\\s*$", "", remaining)
    }
  }

  # Build one named record (token, lemma, pos, morph) from a token text and the
  # contents of the title attributes that belong to it.
  token_row <- function(text, titles) {
    annotations <- as.character(unlist(strsplit(titles, "\\s+")))
    layer_value <- function(prefix) {
      hits <- annotations[startsWith(annotations, prefix)]
      if (length(hits) == 0) {
        return(NA_character_)
      }
      # Last occurrence wins; strip the layer prefix.
      substring(hits[length(hits)], nchar(prefix) + 1)
    }
    c(
      token = text,
      lemma = layer_value("tt/l:"),
      pos = layer_value("tt/p:"),
      morph = layer_value("tt/m:")
    )
  }

  title_pattern <- 'title="([^"]*)"'

  # Primary approach: split at </span>; a chunk that contains annotated spans
  # and ends in plain text is one token with all its annotation layers.
  parts <- trimws(unlist(strsplit(content_to_parse, "</span>", fixed = TRUE)))
  parts <- parts[nzchar(parts) & grepl("<span[^>]*title=", parts)]
  rows <- lapply(parts, function(part) {
    # Text content is everything after the last '>' (must not contain '<').
    text <- trimws(sub(".*>([^<]*)$", "\\1", part))
    if (!nzchar(text) || startsWith(text, "<")) {
      return(NULL)
    }
    titles <- regmatches(part, gregexpr(title_pattern, part))[[1]]
    token_row(text, sub(title_pattern, "\\1", titles))
  })
  rows <- Filter(Negate(is.null), rows)

  # Fallback: look for innermost annotated spans that directly contain text.
  if (length(rows) == 0) {
    innermost_pattern <- '<span[^>]*title="([^"]*)"[^>]*>([^<]+)</span>'
    spans <- regmatches(
      content_to_parse,
      gregexpr(innermost_pattern, content_to_parse, perl = TRUE)
    )[[1]]
    rows <- lapply(spans, function(span) {
      text <- trimws(sub(innermost_pattern, "\\2", span, perl = TRUE))
      if (!nzchar(text)) {
        return(NULL)
      }
      token_row(text, sub(innermost_pattern, "\\1", span, perl = TRUE))
    })
    rows <- Filter(Negate(is.null), rows)
  }

  if (length(rows) == 0) {
    return(empty_result)
  }

  # All layers are collected per token, so the vectors have equal length.
  column <- function(name) vapply(rows, function(row) row[[name]], character(1))
  list(
    token = column("token"),
    lemma = column("lemma"),
    pos = column("pos"),
    morph = column("morph")
  )
}
				892
				893	#'
				894	#' Parse XML annotations into linguistic layers with left/match/right structure
				895	#'
				896	#' Internal helper function to extract linguistic annotations (lemma, POS, morphology)
				897	#' from XML annotation snippets returned by the KorAP API, split into left context,
				898	#' match, and right context sections like the tokens field.
				899	#'
				900	#' @param xml_snippet XML string containing annotation data
				901	#' @return Named list with nested structure containing left/match/right for 'atokens', 'lemma', 'pos', and 'morph'
				902	#' @keywords internal
				903	parse_xml_annotations_structured <- function(xml_snippet) {
				904	if (is.null(xml_snippet) \|\| is.na(xml_snippet) \|\| xml_snippet == "") {
				905	empty_result <- list(left = character(0), match = character(0), right = character(0))
				906	return(list(
				907	atokens = empty_result,
				908	lemma = empty_result,
				909	pos = empty_result,
				910	morph = empty_result
				911	))
				912	}
				913
				914	# Helper function to extract annotations from a span section
				915	extract_annotations_from_section <- function(section_content) {
				916	tokens <- character(0)
				917	lemmas <- character(0)
				918	pos_tags <- character(0)
				919	morph_tags <- character(0)
				920
				921	# Split the content by </span> and process each meaningful part
				922	parts <- unlist(strsplit(section_content, '</span>'))
				923
				924	for (part in parts) {
				925	part <- trimws(part)
				926	if (nchar(part) == 0) next
				927
				928	# Look for parts that have title attributes and end with text
				929	if (grepl('<span[^>]*title=', part)) {
				930	# Extract the text content (everything after the last >)
				931	text_content <- gsub('.>([^<])$', '\\1', part)
				932	text_content <- trimws(text_content)
				933
				934	if (nchar(text_content) > 0 && !grepl('^<', text_content)) {
				935	tokens <- c(tokens, text_content)
				936
				937	# Extract all title attributes from this part
				938	title_pattern <- 'title="([^"]*)"'
				939	title_matches <- gregexpr(title_pattern, part)
				940
				941	lemma <- NA
				942	pos_tag <- NA
				943	morph_tag <- NA
				944
				945	if (title_matches[[1]][1] != -1) {
				946	all_titles <- regmatches(part, title_matches)[[1]]
				947	for (title_match in all_titles) {
				948	title_content <- gsub(title_pattern, '\\1', title_match)
				949
				950	# Split by spaces and process each annotation
				951	annotations <- unlist(strsplit(title_content, "\\s+"))
				952	for (annotation in annotations) {
				953	if (grepl('^tt/l:', annotation)) {
				954	lemma <- gsub('^tt/l:(.*)$', '\\1', annotation)
				955	} else if (grepl('^tt/p:', annotation)) {
				956	pos_tag <- gsub('^tt/p:(.*)$', '\\1', annotation)
				957	} else if (grepl('^tt/m:', annotation)) {
				958	morph_tag <- gsub('^tt/m:(.*)$', '\\1', annotation)
				959	}
				960	}
				961	}
				962	}
				963
				964	lemmas <- c(lemmas, lemma)
				965	pos_tags <- c(pos_tags, pos_tag)
				966	morph_tags <- c(morph_tags, morph_tag)
				967	}
				968	}
				969	}
				970
				971	# If no tokens found with the splitting approach, try a different method
				972	if (length(tokens) == 0) {
				973	# Look for the innermost spans that contain actual text
				974	innermost_pattern <- '<span[^>]title="([^"])"[^>]*>([^<]+)</span>'
				975	innermost_matches <- gregexpr(innermost_pattern, section_content, perl = TRUE)
				976
				977	if (innermost_matches[[1]][1] != -1) {
				978	matches <- regmatches(section_content, innermost_matches)[[1]]
				979
				980	for (match in matches) {
				981	title <- gsub(innermost_pattern, '\\1', match, perl = TRUE)
				982	text <- gsub(innermost_pattern, '\\2', match, perl = TRUE)
				983	text <- trimws(text)
				984
				985	if (nchar(text) > 0) {
				986	tokens <- c(tokens, text)
				987
				988	# Parse space-separated annotations in title
				989	lemma <- NA
				990	pos_tag <- NA
				991	morph_tag <- NA
				992
				993	annotations <- unlist(strsplit(title, "\\s+"))
				994	for (annotation in annotations) {
				995	if (grepl('^tt/l:', annotation)) {
				996	lemma <- gsub('^tt/l:(.*)$', '\\1', annotation)
				997	} else if (grepl('^tt/p:', annotation)) {
				998	pos_tag <- gsub('^tt/p:(.*)$', '\\1', annotation)
				999	} else if (grepl('^tt/m:', annotation)) {
				1000	morph_tag <- gsub('^tt/m:(.*)$', '\\1', annotation)
				1001	}
				1002	}
				1003
				1004	lemmas <- c(lemmas, lemma)
				1005	pos_tags <- c(pos_tags, pos_tag)
				1006	morph_tags <- c(morph_tags, morph_tag)
				1007	}
				1008	}
				1009	}
				1010	}
				1011
				1012	# Ensure all vectors have the same length
				1013	max_length <- max(length(tokens), length(lemmas), length(pos_tags), length(morph_tags))
				1014	if (max_length > 0) {
				1015	tokens <- c(tokens, rep(NA, max_length - length(tokens)))
				1016	lemmas <- c(lemmas, rep(NA, max_length - length(lemmas)))
				1017	pos_tags <- c(pos_tags, rep(NA, max_length - length(pos_tags)))
				1018	morph_tags <- c(morph_tags, rep(NA, max_length - length(morph_tags)))
				1019	}
				1020
				1021	return(list(
				1022	tokens = tokens,
				1023	lemmas = lemmas,
				1024	pos_tags = pos_tags,
				1025	morph_tags = morph_tags
				1026	))
				1027	}
				1028
				1029	# Split the XML into three parts: left context, match content, and right context
				1030	# The structure is: <span class="match">...left...<mark>...match...</mark>...right...</span>
				1031
				1032	# First extract the content within the match span using DOTALL modifier
				1033	match_span_pattern <- '(?s)<span class="match">(.?)</span>\\s<span class="context-right">'
				1034	match_span_match <- regexpr(match_span_pattern, xml_snippet, perl = TRUE)
				1035
				1036	if (match_span_match == -1) {
				1037	# Try alternative pattern if no context-right
				1038	match_span_pattern <- '(?s)<span class="match">(.?)</span>\\s$'
				1039	match_span_match <- regexpr(match_span_pattern, xml_snippet, perl = TRUE)
				1040	}
				1041
				1042	if (match_span_match > 0) {
				1043	match_span_content <- gsub(match_span_pattern, '\\1', xml_snippet, perl = TRUE)
				1044
				1045	# Now find the <mark> and </mark> positions within this content
				1046	mark_start <- regexpr('<mark[^>]*>', match_span_content, perl = TRUE)
				1047	mark_end <- regexpr('</mark>', match_span_content, perl = TRUE)
				1048
				1049	if (mark_start > 0 && mark_end > 0) {
				1050	# Left context: everything before <mark>
				1051	left_content <- substr(match_span_content, 1, mark_start - 1)
				1052
				1053	# Match content: everything between <mark> and </mark> (including the mark tags for now)
				1054	match_content <- substr(match_span_content, mark_start, mark_end + attr(mark_end, "match.length") - 1)
				1055
				1056	# Right context: everything after </mark>
				1057	right_content_start <- mark_end + attr(mark_end, "match.length")
				1058	right_content <- substr(match_span_content, right_content_start, nchar(match_span_content))
				1059	} else {
				1060	# No mark tags found, treat entire match span as match content
				1061	left_content <- ""
				1062	match_content <- match_span_content
				1063	right_content <- ""
				1064	}
				1065	} else {
				1066	# No match span found, treat entire content as match
				1067	left_content <- ""
				1068	match_content <- xml_snippet
				1069	right_content <- ""
				1070	}
				1071
				1072	# Process each section
				1073	left_annotations <- extract_annotations_from_section(left_content)
				1074	match_annotations <- extract_annotations_from_section(match_content)
				1075	right_annotations <- extract_annotations_from_section(right_content)
				1076
				1077	return(list(
				1078	atokens = list(
				1079	left = left_annotations$tokens,
				1080	match = match_annotations$tokens,
				1081	right = right_annotations$tokens
				1082	),
				1083	lemma = list(
				1084	left = left_annotations$lemmas,
				1085	match = match_annotations$lemmas,
				1086	right = right_annotations$lemmas
				1087	),
				1088	pos = list(
				1089	left = left_annotations$pos_tags,
				1090	match = match_annotations$pos_tags,
				1091	right = right_annotations$pos_tags
				1092	),
				1093	morph = list(
				1094	left = left_annotations$morph_tags,
				1095	match = match_annotations$morph_tags,
				1096	right = right_annotations$morph_tags
				1097	)
				1098	))
				1099	}
				1100
#' Fetch annotations for all collected matches
#'
#' `fetchAnnotations` fetches annotations for all matches in the `@collectedMatches` slot
#' of a KorAPQuery object and adds annotation columns directly to the `@collectedMatches`
#' data frame. The method automatically uses the `matchID` from collected matches when
#' available for safer and more reliable annotation retrieval, falling back to constructing
#' URLs from `textSigle`, `matchStart` and `matchEnd` if necessary.
#'
#' Important: For copyright-restricted corpora, users must be authorized via [auth()]
#' and the initial corpus query must have `metadataOnly = FALSE` to ensure snippets are
#' available for annotation parsing.
#'
#' The method parses XML snippet annotations and adds linguistic columns to the data frame:
#' - `pos`: data frame with `left`, `match`, `right` columns, each containing list vectors of part-of-speech tags
#' - `lemma`: data frame with `left`, `match`, `right` columns, each containing list vectors of lemmas
#' - `morph`: data frame with `left`, `match`, `right` columns, each containing list vectors of morphological tags
#' - `atokens`: data frame with `left`, `match`, `right` columns, each containing list vectors of token text (from annotations)
#' - `annotation_snippet`: original XML snippet from the annotation API
#'
#' Matches whose annotation request returned nothing or raised an error get `NA`
#' in all annotation cells and in `annotation_snippet`. Matches whose response
#' contains no snippet get empty character vectors and an `NA` snippet.
#'
#' @family corpus search functions
#' @aliases fetchAnnotations
#'
#' @param kqo object obtained from [corpusQuery()] with collected matches. Note: the original corpus query should have `metadataOnly = FALSE` for annotation parsing to work.
#' @param foundry string specifying the foundry to use for annotations (default: "tt" for Tree-Tagger)
#' @param verbose print progress information if true
#' @return The updated `kqo` object with annotation columns added to `@collectedMatches`
#'
#' @examples
#' \dontrun{
#'
#' # Fetch annotations for matches using Tree-Tagger foundry
#' # Note: Authorization required for copyright-restricted corpora
#' q <- KorAPConnection() |>
#'   auth() |>
#'   corpusQuery("Ameisenplage", metadataOnly = FALSE) |>
#'   fetchNext(maxFetch = 10) |>
#'   fetchAnnotations()
#'
#' # Access linguistic annotations for match i:
#' i <- 1
#' pos_tags <- q@collectedMatches$pos # Data frame with left/match/right columns for POS tags
#' lemmas <- q@collectedMatches$lemma # Data frame with left/match/right columns for lemmas
#' morphology <- q@collectedMatches$morph # Data frame with left/match/right columns for morphological tags
#' atokens <- q@collectedMatches$atokens # Data frame with left/match/right columns for annotation token text
#' raw_snippet <- q@collectedMatches$annotation_snippet[[i]] # Original XML snippet for match i
#'
#' # Access specific components:
#' match_pos <- q@collectedMatches$pos$match[[i]] # POS tags for the matched tokens in match i
#' left_lemmas <- q@collectedMatches$lemma$left[[i]] # Lemmas for the left context in match i
#' right_tokens <- q@collectedMatches$atokens$right[[i]] # Token text for the right context in match i
#'
#' # Use a different foundry (e.g., mate-parser)
#' q <- KorAPConnection() |>
#'   auth() |>
#'   corpusQuery("Ameisenplage", metadataOnly = FALSE) |>
#'   fetchNext(maxFetch = 10) |>
#'   fetchAnnotations(foundry = "mate")
#' q@collectedMatches
#' }
#' @export
setMethod("fetchAnnotations", "KorAPQuery", function(kqo, foundry = "tt", verbose = kqo@korapConnection@verbose) {
  if (is.null(kqo@collectedMatches) || nrow(kqo@collectedMatches) == 0) {
    warning("No collected matches found. Please run fetchNext() or fetchAll() first.")
    return(kqo)
  }

  df <- kqo@collectedMatches
  kco <- kqo@korapConnection
  nrows <- nrow(df)

  if (verbose) {
    cat("Fetching annotations for", nrows, "matches using foundry:", foundry, "\n")
  }

  # Annotation record (layer -> section) in which every cell holds `value`.
  uniform_annotations <- function(value) {
    sections <- list(left = value, match = value, right = value)
    list(pos = sections, lemma = sections, morph = sections, atokens = sections)
  }
  # Request failed (NULL response or error): NA everywhere.
  failed_record <- list(snippet = NA, annotations = uniform_annotations(NA))
  # Request succeeded but the response carries no snippet: empty vectors.
  empty_record <- list(snippet = NA, annotations = uniform_annotations(character(0)))

  # URL built from textSigle and the match position; used when no usable matchID exists.
  fallback_url <- function(i) {
    paste0(kco@apiUrl, "corpus/", df$textSigle[i], "/", "p", df$matchStart[i], "-", df$matchEnd[i], "?foundry=", foundry)
  }

  # Build the annotation request URL for match i, preferring the matchID.
  build_request <- function(i) {
    if (!("matchID" %in% colnames(df) && !is.na(df$matchID[i]))) {
      req <- fallback_url(i)
      if (verbose) {
        cat("Using fallback approach for match", i, ": textSigle =", df$textSigle[i], "\n")
        cat("Constructed URL:", req, "\n")
      }
      return(req)
    }

    # matchID format: "match-match-A00/JUN/39609-p202-203" or encrypted format like
    # "match-DNB10/CSL/80400-p2343-2344x_MinDOhu_P6dd2MMZJyyus_7MairdKnr1LxY07Cya-Ow"
    match_id <- df$matchID[i]
    if (!grepl("match-(.+?-p\\d+-\\d+.*)", match_id, perl = TRUE)) {
      if (verbose) {
        cat("Failed to parse matchID format:", match_id, "\n")
        cat("Falling back to textSigle + position method\n")
      }
      return(fallback_url(i))
    }

    # Everything after the leading "match-", including any encrypted suffix
    doc_path_with_pos_and_encryption <- gsub("^match-(.+)$", "\\1", match_id, perl = TRUE)
    # Turn the dash before the position into a slash; keep everything after it
    match_path <- gsub("-p(\\d+-\\d+.*)", "/p\\1", doc_path_with_pos_and_encryption)
    req <- paste0(kco@apiUrl, "corpus/", match_path, "?foundry=", foundry)

    if (verbose) {
      cat("Using matchID approach for match", i, ": matchID =", match_id, "\n")
      cat("Extracted doc path with encryption:", doc_path_with_pos_and_encryption, "\n")
      cat("Final match path:", match_path, "\n")
      cat("Constructed URL:", req, "\n")
    }
    req
  }

  # Verbose diagnostics about the raw API response for match i.
  describe_response <- function(i, res) {
    cat("API call result for match", i, ":\n")
    if (is.null(res)) {
      cat(" Result is NULL\n")
      return(invisible(NULL))
    }
    cat(" Result class:", class(res), "\n")
    if (is.list(res)) {
      cat(" Result names:", paste(names(res), collapse = ", "), "\n")
      if ("snippet" %in% names(res)) {
        snippet_length <- if (is.character(res$snippet)) nchar(res$snippet) else "not character"
        cat(" Snippet length:", snippet_length, "\n")
        if (is.character(res$snippet) && nchar(res$snippet) > 0) {
          cat(" Snippet preview:", substr(res$snippet, 1, 100), "...\n")
        }
      }
    }
    invisible(NULL)
  }

  # Verbose summary of the parsed annotations of one match.
  describe_parsed <- function(i, parsed_annotations) {
    cat("Match", i, "parsed annotations:\n")
    cat(" Left tokens:", length(parsed_annotations$atokens$left), "\n")
    cat(" Match tokens:", length(parsed_annotations$atokens$match), "\n")
    cat(" Right tokens:", length(parsed_annotations$atokens$right), "\n")
    if (length(parsed_annotations$pos$match) > 0 && any(!is.na(parsed_annotations$pos$match))) {
      cat(" Match POS tags:", paste(parsed_annotations$pos$match, collapse = ", "), "\n")
    }
    if (length(parsed_annotations$lemma$match) > 0 && any(!is.na(parsed_annotations$lemma$match))) {
      cat(" Match lemmas:", paste(parsed_annotations$lemma$match, collapse = ", "), "\n")
    }
    if (length(parsed_annotations$morph$match) > 0 && any(!is.na(parsed_annotations$morph$match))) {
      cat(" Match morph tags:", paste(parsed_annotations$morph$match, collapse = ", "), "\n")
    }
  }

  # Fetch and parse the annotations of match i; returns list(snippet, annotations).
  fetch_match <- function(i, req) {
    res <- apiCall(kco, req)
    if (verbose) {
      describe_response(i, res)
    }
    if (is.null(res)) {
      if (verbose) {
        cat("Warning: No annotations returned for match", i, "\n")
      }
      return(failed_record)
    }
    if (!(is.list(res) && "snippet" %in% names(res))) {
      return(empty_record)
    }
    parsed_annotations <- parse_xml_annotations_structured(res$snippet)
    if (verbose && i <= 3) { # Show details for first few matches
      describe_parsed(i, parsed_annotations)
    }
    list(snippet = res$snippet, annotations = parsed_annotations)
  }

  results <- vector("list", nrows)
  for (i in seq_len(nrows)) {
    if (verbose && i %% 10 == 0) {
      cat("Processing match", i, "of", nrows, "\n")
    }
    req <- build_request(i)
    # The handler returns the failure record as the value of tryCatch(), so a
    # failed match is recorded without assigning to outer variables from
    # inside the handler.
    results[[i]] <- tryCatch(
      fetch_match(i, req),
      error = function(e) {
        if (verbose) {
          cat("Error fetching annotations for match", i, ":", e$message, "\n")
        }
        failed_record
      }
    )
  }

  # Nested data frame (like the tokens field) for one annotation layer, with
  # one list cell per match in each of the left/match/right columns.
  layer_frame <- function(layer) {
    section_column <- function(section) {
      I(lapply(results, function(record) record$annotations[[layer]][[section]]))
    }
    data.frame(
      left = section_column("left"),
      match = section_column("match"),
      right = section_column("right"),
      stringsAsFactors = FALSE
    )
  }

  df$pos <- layer_frame("pos")
  df$lemma <- layer_frame("lemma")
  df$morph <- layer_frame("morph")
  df$atokens <- layer_frame("atokens")
  df$annotation_snippet <- lapply(results, function(record) record$snippet)

  if (verbose) {
    successful_annotations <- sum(!is.na(df$annotation_snippet))
    cat("Successfully fetched annotations for", successful_annotations, "of", nrow(df), "matches\n")
    cat("Linguistic data stored as columns in collectedMatches\n")
    cat("Data frame dimensions before assignment:", nrow(df), "x", ncol(df), "\n")
  }

  kqo@collectedMatches <- df
  kqo
})
				1445
#' Query frequencies of search expressions in virtual corpora
#'
#' `frequencyQuery` combines [corpusQuery()], [corpusStats()] and
#' [ci()] to compute a tibble with the absolute and relative frequencies and
#' confidence intervals of one or multiple search terms across one or multiple
#' virtual corpora.
#'
#' @family frequency analysis
#' @aliases frequencyQuery
#' @examples
#' \dontrun{
#'
#' KorAPConnection(verbose = TRUE) |>
#'   frequencyQuery(c("Mücke", "Schnake"), paste0("pubDate in ", 2000:2003))
#' }
#'
# @inheritParams corpusQuery
#' @param kco [KorAPConnection()] object (obtained e.g. from `KorAPConnection()`)
#' @param query corpus query string(s) (can be a vector). The query language depends on the `ql` parameter. Either `query` must be provided or `KorAPUrl`.
#' @param vc virtual corpus definition(s) (can be a vector)
#' @param conf.level confidence level of the returned confidence interval (passed through [ci()] to [prop.test()]).
#' @param as.alternatives LOGICAL that specifies if the query terms should be treated as alternatives. If `as.alternatives` is TRUE, the sum over all query hits, instead of the respective vc token sizes is used as total for the calculation of relative frequencies.
#' @param ... further arguments passed to or from other methods (see [corpusQuery()]), most notably `expand`, a logical that decides if `query` and `vc` parameters are expanded to all of their combinations. It defaults to `TRUE`, if `query` and `vc` have different lengths, and to `FALSE` otherwise.
#' @export
#'
#' @return A tibble, with each row containing the following result columns for query and vc combinations:
#' - query: the query string used for the frequency analysis.
#' - totalResults: absolute frequency of query matches in the vc.
#' - vc: virtual corpus used for the query.
#' - webUIRequestUrl: URL of the corresponding web UI request with respect to query and vc.
#' - total: total number of words in vc.
#' - f: relative frequency of query matches in the vc.
#' - conf.low: lower bound of the confidence interval for the relative frequency, given `conf.level`.
#' - conf.high: upper bound of the confidence interval for the relative frequency, given `conf.level`.

setMethod(
  "frequencyQuery", "KorAPConnection",
  function(kco, query, vc = "", conf.level = 0.95, as.alternatives = FALSE, ...) {
    with_totals <- if (as.alternatives) {
      # Alternatives: the denominator is the sum of all query hits per vc.
      matches_by_vc <- group_by(
        corpusQuery(kco, query, vc, metadataOnly = TRUE, as.df = TRUE, ...),
        vc
      )
      mutate(matches_by_vc, total = sum(totalResults))
    } else {
      # Independent terms: the denominator is the token count of the vc.
      # `vc` is evaluated inside mutate(), so it refers to the `vc` column of
      # the query result where one exists, not the function argument.
      matches <- corpusQuery(kco, query, vc, metadataOnly = TRUE, as.df = TRUE, ...)
      mutate(matches, total = corpusStats(kco, vc = vc, as.df = TRUE)$tokens)
    }
    ci(with_totals, conf.level = conf.level)
  }
)
Marc Kupietz	3f57528	2019-10-04 14:46:04 +0200	[diff] [blame]	1495
#' buildWebUIRequestUrlFromString
#'
#' @rdname KorAPQuery-class
#' @importFrom urltools url_encode
#' @export
buildWebUIRequestUrlFromString <- function(KorAPUrl, query, vc = "", ql = "poliqarp") {
  # Accept a KorAPConnection in place of the base URL string.
  if ("KorAPConnection" %in% class(KorAPUrl)) {
    KorAPUrl <- KorAPUrl@KorAPUrl
  }

  encoded_query <- urltools::url_encode(enc2utf8(as.character(query)))
  # The cq parameter is added only for non-empty virtual corpus definitions;
  # ifelse() keeps this element-wise for vector input.
  corpus_parameter <- ifelse(vc != "",
    paste0("&cq=", urltools::url_encode(enc2utf8(vc))),
    ""
  )

  request <- paste0("?q=", encoded_query, corpus_parameter, "&ql=", ql)
  paste0(KorAPUrl, request)
}
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1522
				1523	#' buildWebUIRequestUrl
				1524	#'
				1525	#' @rdname KorAPQuery-class
Marc Kupietz	f912959	2025-01-26 19:17:54 +0100	[diff] [blame]	1526	#' @importFrom httr2 url_parse
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1527	#' @export
				1528	buildWebUIRequestUrl <- function(kco,
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	1529	query = if (missing(KorAPUrl)) {
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1530	stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	1531	} else {
				1532	httr2::url_parse(KorAPUrl)$query$q
				1533	},
Marc Kupietz	f912959	2025-01-26 19:17:54 +0100	[diff] [blame]	1534	vc = if (missing(KorAPUrl)) "" else httr2::url_parse(KorAPUrl)$query$cq,
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1535	KorAPUrl,
Marc Kupietz	f912959	2025-01-26 19:17:54 +0100	[diff] [blame]	1536	ql = if (missing(KorAPUrl)) "poliqarp" else httr2::url_parse(KorAPUrl)$query$ql) {
Marc Kupietz	38a9d68	2024-12-06 16:17:09 +0100	[diff] [blame]	1537	buildWebUIRequestUrlFromString(kco@KorAPUrl, query, vc, ql)
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1538	}
				1539
#' format()
#' @rdname KorAPQuery-class
#' @param x KorAPQuery object
#' @param ... further arguments passed to or from other methods
#' @importFrom urltools param_get url_decode
#' @export
format.KorAPQuery <- function(x, ...) {
  # Prints a human-readable summary of the query state with cat()/print().
  cat("<KorAPQuery>\n")
  param <- urltools::param_get(x@request) |> lapply(urltools::url_decode)
  cat(" Query: ", param$q, "\n")
  if (!is.null(param$cq) && param$cq != "") {
    cat(" Virtual corpus: ", param$cq, "\n")
  }

  matches <- x@collectedMatches
  if (!is.null(matches)) {
    cat("==============================================================================================================", "\n")
    print(summary(matches))
    cat("==============================================================================================================", "\n")
  }
  cat(" Total results: ", x@totalResults, "\n")
  cat(" Fetched results: ", x@nextStartIndex, "\n")

  if (!is.null(matches) && "pos" %in% colnames(matches)) {
    successful_annotations <- sum(!is.na(matches$annotation_snippet))

    # Count matches, not cells: `pos` is a nested data frame with one list
    # cell per context section, so is.na() on it would yield up to three
    # values per match and would treat empty vectors as parsed data.
    has_values <- function(cell) length(cell) > 0 && any(!is.na(cell))
    pos <- matches$pos
    parsed_annotations <- if (is.data.frame(pos)) {
      sum(vapply(seq_len(nrow(pos)), function(i) {
        any(vapply(pos, function(column) has_values(column[[i]]), logical(1)))
      }, logical(1)))
    } else {
      sum(!is.na(pos))
    }

    cat(" Annotations: ", successful_annotations, " of ", nrow(matches), " matches")
    if (parsed_annotations > 0) {
      cat(" (", parsed_annotations, " with parsed linguistic data)")
    }
    cat("\n")
  }
}
				1571
#' show()
#'
#' @rdname KorAPQuery-class
#' @param object KorAPQuery object
#' @export
setMethod(
  "show", "KorAPQuery",
  # Auto-printing a KorAPQuery delegates to its format() method, which writes
  # the summary to the console.
  function(object) format(object)
)