Blame - R/KorAPQuery.R - KorAP/RKorAPClient

blob: e43467d37a76abcff4195cefbd0899339177beb4 [file] [log] [blame]

Marc Kupietz	a8c40f4	2025-06-24 15:49:52 +0200	[diff] [blame]	1	#' KorAPQuery class (internal)
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	2	#'
Marc Kupietz	a8c40f4	2025-06-24 15:49:52 +0200	[diff] [blame]	3	#' Internal class for query state management. Users work with `corpusQuery()`, `fetchAll()`, and `fetchNext()` instead.
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	4	#'
Marc Kupietz	a8c40f4	2025-06-24 15:49:52 +0200	[diff] [blame]	5	#' @keywords internal
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	6	#' @include KorAPConnection.R
Marc Kupietz	6dfeed9	2025-06-03 11:58:06 +0200	[diff] [blame]	7	#' @include logging.R
Marc Kupietz	f912959	2025-01-26 19:17:54 +0100	[diff] [blame]	8	#' @import httr2
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	9	#'
Marc Kupietz	a6e4ee6	2021-03-05 09:00:15 +0100	[diff] [blame]	10	#' @include RKorAPClient-package.R
Marc Kupietz	5bbc9db	2019-08-30 16:30:45 +0200	[diff] [blame]	11
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	12	#' @export
				13	KorAPQuery <- setClass("KorAPQuery", slots = c(
Marc Kupietz	b897218	2019-09-20 21:33:46 +0200	[diff] [blame]	14	"korapConnection",
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	15	"request",
				16	"vc",
				17	"totalResults",
				18	"nextStartIndex",
				19	"fields",
				20	"requestUrl",
				21	"webUIRequestUrl",
				22	"apiResponse",
				23	"collectedMatches",
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	24	"hasMoreMatches"
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	25	))
Marc Kupietz	5bbc9db	2019-08-30 16:30:45 +0200	[diff] [blame]	26
Marc Kupietz	a8c40f4	2025-06-24 15:49:52 +0200	[diff] [blame]	27	#' Initialize KorAPQuery object
				28	#' @keywords internal
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	29	#' @param .Object …
Marc Kupietz	b897218	2019-09-20 21:33:46 +0200	[diff] [blame]	30	#' @param korapConnection KorAPConnection object
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	31	#' @param request query part of the request URL
				32	#' @param vc definition of a virtual corpus
				33	#' @param totalResults number of hits the query has yielded
				34	#' @param nextStartIndex at what index to start the next fetch of query results
				35	#' @param fields what data / metadata fields should be collected
				36	#' @param requestUrl complete URL of the API request
				37	#' @param webUIRequestUrl URL of a web frontend request corresponding to the API request
				38	#' @param apiResponse data-frame representation of the JSON response of the API request
Marc Kupietz	7776dec	2019-09-27 16:59:02 +0200	[diff] [blame]	39	#' @param hasMoreMatches logical that signals if more query results can be fetched
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	40	#' @param collectedMatches matches already fetched from the KorAP-API-server
Marc Kupietz	97a1bca	2019-10-04 22:52:09 +0200	[diff] [blame]	41	#'
				42	#' @importFrom tibble tibble
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	43	#' @export
setMethod(
  "initialize",
  signature = "KorAPQuery",
  definition = function(.Object,
                        korapConnection = NULL,
                        request = NULL,
                        vc = "",
                        totalResults = 0,
                        nextStartIndex = 0,
                        fields = c(
                          "corpusSigle", "textSigle", "pubDate", "pubPlace",
                          "availability", "textClass", "snippet", "tokens"
                        ),
                        requestUrl = "",
                        webUIRequestUrl = "",
                        apiResponse = NULL,
                        hasMoreMatches = FALSE,
                        collectedMatches = NULL) {
    # Let the inherited initializer build the object first; afterwards every
    # slot is written explicitly so that the defaults declared above also
    # apply to arguments the caller left out.
    .Object <- callNextMethod()

    slot_values <- list(
      korapConnection = korapConnection,
      request = request,
      vc = vc,
      totalResults = totalResults,
      nextStartIndex = nextStartIndex,
      fields = fields,
      requestUrl = requestUrl,
      webUIRequestUrl = webUIRequestUrl,
      apiResponse = apiResponse,
      hasMoreMatches = hasMoreMatches,
      collectedMatches = collectedMatches
    )
    for (slot_name in names(slot_values)) {
      slot(.Object, slot_name) <- slot_values[[slot_name]]
    }

    .Object
  }
)
Marc Kupietz	632cbd4	2019-09-06 16:04:51 +0200	[diff] [blame]	66
# Generic functions of the query API. Each body is a bare standardGeneric()
# call; the argument lists are extended by the individual methods.
setGeneric(
  "corpusQuery",
  def = function(kco, ...) standardGeneric("corpusQuery")
)
setGeneric(
  "fetchAll",
  def = function(kqo, ...) standardGeneric("fetchAll")
)
setGeneric(
  "fetchNext",
  def = function(kqo, ...) standardGeneric("fetchNext")
)
setGeneric(
  "fetchRest",
  def = function(kqo, ...) standardGeneric("fetchRest")
)
setGeneric(
  "fetchAnnotations",
  def = function(kqo, foundry = "tt", overwrite = FALSE, verbose = kqo@korapConnection@verbose)
    standardGeneric("fetchAnnotations")
)
setGeneric(
  "frequencyQuery",
  def = function(kco, ...) standardGeneric("frequencyQuery")
)

# Page size used when fetching results: default for `maxFetch` in fetchNext()
# and the divisor for its page arithmetic.
maxResultsPerPage <- 50

## quiets concerns of R CMD check re: the .'s that appear in pipelines
utils::globalVariables(".")
Marc Kupietz	632cbd4	2019-09-06 16:04:51 +0200	[diff] [blame]	84
Marc Kupietz	a8c40f4	2025-06-24 15:49:52 +0200	[diff] [blame]	85	#' Search corpus for query terms
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	86	#'
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	87	#' `corpusQuery` performs a corpus query via a connection to a KorAP-API-server
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	88	#'
Marc Kupietz	a8c40f4	2025-06-24 15:49:52 +0200	[diff] [blame]	89	#' @family corpus search functions
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	90	#' @aliases corpusQuery
				91	#'
				92	#' @importFrom urltools url_encode
				93	#' @importFrom purrr pmap
Marc Kupietz	ea34b81	2025-06-25 15:49:00 +0200	[diff] [blame]	94	#' @importFrom dplyr bind_rows group_by
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	95	#'
Marc Kupietz	617266d	2025-02-27 10:43:07 +0100	[diff] [blame]	96	#' @param kco [KorAPConnection()] object (obtained e.g. from `KorAPConnection()`)
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	97	#' @param query string that contains the corpus query. The query language depends on the `ql` parameter. Either `query` must be provided or `KorAPUrl`.
Marc Kupietz	632cbd4	2019-09-06 16:04:51 +0200	[diff] [blame]	98	#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	99	#' @param KorAPUrl instead of providing the query and vc string parameters, you can also simply copy a KorAP query URL from your browser and use it here (and in `KorAPConnection`) to provide all necessary information for the query.
Marc Kupietz	132f005	2023-04-16 14:23:05 +0200	[diff] [blame]	100	#' @param metadataOnly logical that determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. Note that the default value is TRUE.
				101	#' If you want your corpus queries to return not only metadata, but also KWICS, you need to authorize
				102	#' your RKorAPClient application as explained in the
				103	#' [authorization section](https://github.com/KorAP/RKorAPClient#authorization)
				104	#' of the RKorAPClient Readme on GitHub and set the `metadataOnly` parameter to
				105	#' `FALSE`.
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	106	#' @param ql string to choose the query language (see [section on Query Parameters](https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET#user-content-parameters) in the Kustvakt-Wiki for possible values).
Marc Kupietz	1623fe8	2025-06-24 16:31:46 +0200	[diff] [blame]	107	#' @param fields character vector specifying which metadata fields to retrieve for each match.
				108	#' Available fields depend on the corpus. For DeReKo (German Reference Corpus), possible fields include:
				109	#' \describe{
				110	#' \item{Text identification:}{`textSigle`, `docSigle`, `corpusSigle` - hierarchical text identifiers}
				111	#' \item{Publication info:}{`author`, `editor`, `title`, `docTitle`, `corpusTitle` - authorship and titles}
				112	#' \item{Temporal data:}{`pubDate`, `creationDate` - when text was published/created}
				113	#' \item{Publication details:}{`pubPlace`, `publisher`, `reference` - where/how published}
				114	#' \item{Text classification:}{`textClass`, `textType`, `textTypeArt`, `textDomain`, `textColumn` - topic domain, genre, text type and column}
				115	#' \item{Administrative and technical info:}{`corpusEditor`, `availability`, `language`, `foundries` - access rights and annotations}
				116	#' \item{Content data:}{`snippet`, `tokens`, `tokenSource`, `externalLink` - actual text content, tokenization, and link to source text}
				117	#' \item{System data:}{`indexCreationDate`, `indexLastModified` - corpus indexing info}
				118	#' }
				119	#' Use `c("textSigle", "pubDate", "author")` to retrieve multiple fields.
				120	#' Default fields provide basic text identification and publication metadata. The actual text content (`snippet` and `tokens`) are activated by default if `metadataOnly` is set to `FALSE`.
Marc Kupietz	43a6ade	2020-02-18 17:01:44 +0100	[diff] [blame]	121	#' @param accessRewriteFatal abort if query or given vc had to be rewritten due to insufficient rights (not yet implemented).
Marc Kupietz	25aebc3	2019-09-16 18:40:50 +0200	[diff] [blame]	122	#' @param verbose print some info
Marc Kupietz	4de53ec	2019-10-04 09:12:00 +0200	[diff] [blame]	123	#' @param as.df return result as data frame instead of as S4 object?
Marc Kupietz	ad8d2ed	2025-04-05 15:37:38 +0200	[diff] [blame]	124	#' @param expand logical that decides if `query` and `vc` parameters are expanded to all of their combinations. Defaults to `TRUE`, iff `query` and `vc` have different lengths
Marc Kupietz	d9b2fd7	2023-04-17 19:08:50 +0200	[diff] [blame]	125	#' @param context string that specifies the size of the left and the right context returned in `snippet`
				126	#' (provided that `metadataOnly` is set to `FALSE` and that the necessary access rights are met).
				127	#' The format of the context size specification (e.g. `3-token,3-token`) is described in the [Service: Search GET documentation of the Kustvakt Wiki](https://github.com/KorAP/Kustvakt/wiki/Service:-Search-GET).
				128	#' If the parameter is not set, the default context size specification of the KorAP server instance will be used.
				129	#' Note that you cannot overrule the maximum context size set in the KorAP server instance,
				130	#' as this is typically legally motivated.
Marc Kupietz	ad8d2ed	2025-04-05 15:37:38 +0200	[diff] [blame]	131	#' @return Depending on the `as.df` parameter, a tibble or a [KorAPQuery()] object that, among other information, contains the total number of results in `@totalResults`. The resulting object can be used to fetch all query results (with [fetchAll()]) or the next page of results (with [fetchNext()]).
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	132	#' A corresponding URL to be used within a web browser is contained in `@webUIRequestUrl`.
				133	#' Please make sure to check `$collection$rewrites` to see if any unforeseen access rewrites of the query's virtual corpus had to be performed.
Marc Kupietz	632cbd4	2019-09-06 16:04:51 +0200	[diff] [blame]	134	#'
				135	#' @examples
Marc Kupietz	6ae7605	2021-09-21 10:34:00 +0200	[diff] [blame]	136	#' \dontrun{
				137	#'
Marc Kupietz	1623fe8	2025-06-24 16:31:46 +0200	[diff] [blame]	138	#' # Fetch basic metadata for "Ameisenplage"
Marc Kupietz	d352642	2025-06-25 09:16:15 +0200	[diff] [blame]	139	#' KorAPConnection() |>
				140	#' corpusQuery("Ameisenplage") |>
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	141	#' fetchAll()
Marc Kupietz	1623fe8	2025-06-24 16:31:46 +0200	[diff] [blame]	142	#'
				143	#' # Fetch specific metadata fields for bibliographic analysis
Marc Kupietz	d352642	2025-06-25 09:16:15 +0200	[diff] [blame]	144	#' query <- KorAPConnection() |>
Marc Kupietz	1623fe8	2025-06-24 16:31:46 +0200	[diff] [blame]	145	#' corpusQuery("Ameisenplage",
				146	#' fields = c("textSigle", "author", "title", "pubDate", "pubPlace", "textType"))
				147	#' results <- fetchAll(query)
				148	#' results@collectedMatches
Marc Kupietz	657d8e7	2020-02-25 18:31:50 +0100	[diff] [blame]	149	#' }
Marc Kupietz	3c531f6	2019-09-13 12:17:24 +0200	[diff] [blame]	150	#'
Marc Kupietz	6ae7605	2021-09-21 10:34:00 +0200	[diff] [blame]	151	#' \dontrun{
				152	#'
Marc Kupietz	603491f	2019-09-18 14:01:02 +0200	[diff] [blame]	153	#' # Use the copy of a KorAP-web-frontend URL for an API query of "Ameise" in a virtual corpus
				154	#' # and show the number of query hits (but don't fetch them).
Marc Kupietz	69cc54a	2019-09-30 12:06:54 +0200	[diff] [blame]	155	#'
Marc Kupietz	d352642	2025-06-25 09:16:15 +0200	[diff] [blame]	156	#' KorAPConnection(verbose = TRUE) |>
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	157	#' corpusQuery(
				158	#' KorAPUrl =
				159	#' "https://korap.ids-mannheim.de/?q=Ameise&cq=pubDate+since+2017&ql=poliqarp"
				160	#' )
Marc Kupietz	6ae7605	2021-09-21 10:34:00 +0200	[diff] [blame]	161	#' }
				162	#'
				163	#' \dontrun{
Marc Kupietz	3c531f6	2019-09-13 12:17:24 +0200	[diff] [blame]	164	#'
Marc Kupietz	603491f	2019-09-18 14:01:02 +0200	[diff] [blame]	165	#' # Plot the time/frequency curve of "Ameisenplage"
Marc Kupietz	d352642	2025-06-25 09:16:15 +0200	[diff] [blame]	166	#' KorAPConnection(verbose = TRUE) |>
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	167	#' {
				168	#' . ->> kco
Marc Kupietz	d352642	2025-06-25 09:16:15 +0200	[diff] [blame]	169	#' } |>
				170	#' corpusQuery("Ameisenplage") |>
				171	#' fetchAll() |>
				172	#' slot("collectedMatches") |>
				173	#' mutate(year = lubridate::year(pubDate)) |>
				174	#' dplyr::select(year) |>
				175	#' group_by(year) |>
				176	#' summarise(Count = dplyr::n()) |>
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	177	#' mutate(Freq = mapply(function(f, y) {
				178	#' f / corpusStats(kco, paste("pubDate in", y))@tokens
Marc Kupietz	d352642	2025-06-25 09:16:15 +0200	[diff] [blame]	179	#' }, Count, year)) |>
				180	#' dplyr::select(-Count) |>
				181	#' complete(year = min(year):max(year), fill = list(Freq = 0)) |>
Marc Kupietz	69cc54a	2019-09-30 12:06:54 +0200	[diff] [blame]	182	#' plot(type = "l")
Marc Kupietz	05b2277	2020-02-18 21:58:42 +0100	[diff] [blame]	183	#' }
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	184	#' @seealso [KorAPConnection()], [fetchNext()], [fetchRest()], [fetchAll()], [corpusStats()]
Marc Kupietz	632cbd4	2019-09-06 16:04:51 +0200	[diff] [blame]	185	#'
				186	#' @references
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	187	#' <https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026>
Marc Kupietz	632cbd4	2019-09-06 16:04:51 +0200	[diff] [blame]	188	#'
				189	#' @export
setMethod(
  "corpusQuery", "KorAPConnection",
  function(kco,
           query = if (missing(KorAPUrl)) {
             stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
           } else {
             httr2::url_parse(KorAPUrl)$query$q
           },
           vc = if (missing(KorAPUrl)) "" else httr2::url_parse(KorAPUrl)$query$cq,
           KorAPUrl,
           metadataOnly = TRUE,
           ql = if (missing(KorAPUrl)) "poliqarp" else httr2::url_parse(KorAPUrl)$query$ql,
           fields = c(
             "corpusSigle",
             "textSigle",
             "pubDate",
             "pubPlace",
             "availability",
             "textClass",
             "snippet",
             "tokens"
           ),
           accessRewriteFatal = TRUE,
           verbose = kco@verbose,
           expand = length(vc) != length(query),
           as.df = FALSE,
           context = NULL) {
    # Evaluate `query` first so that the "query or KorAPUrl" error is raised
    # before anything else happens.
    force(query)

    # A KorAPUrl without `cq` / `ql` yields NULL here; fall back to the same
    # values that are used when no KorAPUrl is given at all.
    if (is.null(vc)) {
      vc <- ""
    }
    if (is.null(ql)) {
      ql <- "poliqarp"
    }

    # TRUE only for a single, non-missing, non-empty string.
    has_text <- function(x) length(x) == 1 && !is.na(x) && nzchar(x)

    # Fields actually requested from the API: content fields are dropped for
    # metadata-only queries and `textSigle` is always included.
    query_fields <- fields
    if (metadataOnly) {
      query_fields <- query_fields[!query_fields %in% c("snippet", "tokens")]
    }
    if (!"textSigle" %in% query_fields) {
      query_fields <- c(query_fields, "textSigle")
    }

    # Build the query string plus the corresponding web UI and API URLs for
    # one query / virtual corpus pair.
    build_urls <- function(query, vc) {
      request <- paste0(
        "?q=",
        url_encode(enc2utf8(query)),
        if (!metadataOnly && has_text(context)) {
          paste0("&context=", url_encode(enc2utf8(context)))
        } else {
          ""
        },
        if (has_text(vc)) paste0("&cq=", url_encode(enc2utf8(vc))) else "",
        if (!metadataOnly) "&show-tokens=true" else "",
        "&ql=", ql
      )
      list(
        request = request,
        webUIRequestUrl = paste0(kco@KorAPUrl, request),
        requestUrl = paste0(
          kco@apiUrl,
          "search",
          request,
          "&fields=",
          paste(query_fields, collapse = ","),
          if (metadataOnly) "&access-rewrite-disabled=true" else ""
        )
      )
    }

    # Round a benchmark such as "0.1234s" to two decimals for display.
    # Accepts a comma as decimal separator (e.g. "0,12s"); anything that
    # cannot be parsed is returned unchanged.
    format_benchmark <- function(benchmark) {
      if (!(is.character(benchmark) && grepl("s$", benchmark))) {
        return(benchmark)
      }
      seconds <- suppressWarnings(
        as.numeric(gsub(",", ".", sub("s$", "", benchmark)))
      )
      if (is.na(seconds)) benchmark else paste0(round(seconds, 2), "s")
    }

    if (length(query) > 1 || length(vc) > 1) {
      # Several queries and/or virtual corpora: run one count-only request per
      # combination and return one row per request.
      grid <- if (expand) expand_grid(query = query, vc = vc) else tibble(query = query, vc = vc)

      total_queries <- nrow(grid)
      start_time <- Sys.time() # reference point for the ETA display

      results <- lapply(seq_len(total_queries), function(current_query) {
        single_query <- grid$query[[current_query]]
        single_vc <- grid$vc[[current_query]]
        urls <- build_urls(single_query, single_vc)

        # Show individual query progress
        log_info(verbose, "\rSearching \"", single_query, "\" in \"", single_vc, "\"", sep = "")
        res <- apiCall(kco, paste0(urls$requestUrl, "&count=0"))
        if (is.null(res)) {
          log_info(verbose, ": API call failed\n")
          totalResults <- 0
        } else {
          # Check for query rewrites and warn the user
          warnOnRewrites(res)

          totalResults <- as.integer(res$meta$totalResults)
          log_info(verbose, ": ", totalResults, " hits")
          if (!is.null(res$meta$cached)) {
            log_info(verbose, " [cached]")
          } else if (!is.null(res$meta$benchmark)) {
            log_info(verbose, ", took ", format_benchmark(res$meta$benchmark))
          }

          # Append ETA information to the same line
          if (verbose && total_queries > 1) {
            eta_info <- calculate_eta(current_query, total_queries, start_time)
            if (eta_info != "") {
              log_info(verbose, eta_info)
            }
          }

          log_info(verbose, "\n")
        }

        data.frame(
          query = single_query,
          totalResults = totalResults,
          vc = single_vc,
          webUIRequestUrl = urls$webUIRequestUrl,
          stringsAsFactors = FALSE
        )
      })

      bind_rows(results)
    } else {
      urls <- build_urls(query, vc)

      log_info(verbose, "\rSearching \"", query, "\" in \"", vc, "\"", sep = "")
      # `count=0`: only the result metadata is requested here, matches are
      # fetched later on demand.
      res <- apiCall(kco, paste0(urls$requestUrl, "&count=0"))
      if (is.null(res)) {
        message("API call failed.")
        totalResults <- 0
      } else {
        # Check for query rewrites and warn the user
        warnOnRewrites(res)

        totalResults <- as.integer(res$meta$totalResults)
        log_info(verbose, ": ", totalResults, " hits")
        if (!is.null(res$meta$cached)) {
          log_info(verbose, " [cached]\n")
        } else if (!is.null(res$meta$benchmark)) {
          log_info(verbose, ", took ", format_benchmark(res$meta$benchmark), "\n", sep = "")
        } else {
          log_info(verbose, "\n")
        }
      }

      if (as.df) {
        data.frame(
          query = query,
          totalResults = totalResults,
          vc = vc,
          webUIRequestUrl = urls$webUIRequestUrl,
          stringsAsFactors = FALSE
        )
      } else {
        KorAPQuery(
          korapConnection = kco,
          nextStartIndex = 0,
          fields = query_fields,
          requestUrl = urls$requestUrl,
          request = urls$request,
          totalResults = totalResults,
          vc = vc,
          apiResponse = res,
          webUIRequestUrl = urls$webUIRequestUrl,
          hasMoreMatches = (totalResults > 0)
        )
      }
    }
  }
)
Marc Kupietz	5bbc9db	2019-08-30 16:30:45 +0200	[diff] [blame]	395
# Helper that warns the user when the server reports a rewrite of the query's
# virtual corpus.
#
# `res` is the parsed API response; only `res$collection$rewrites` is read.
# The standard policy message "All corpus access policy has been added." is
# not reported. `rewrites` may hold several entries (e.g. one data frame row
# per rewrite), so the comments are filtered element-wise instead of being
# used directly as an `if` condition, which fails for length > 1 or NA.
# Returns invisibly; called for its warning side effect.
warnOnRewrites <- function(res) {
  rewrites <- res$collection$rewrites
  if (is.null(rewrites)) {
    return(invisible(NULL))
  }
  comment <- rewrites$`_comment`
  if (is.null(comment)) {
    return(invisible(NULL))
  }

  # Only show a warning for comments other than the standard policy message
  relevant <- !is.na(comment) & comment != "All corpus access policy has been added."
  if (any(relevant)) {
    editor <- rewrites$editor
    # Keep only the editors that belong to the reported comments
    if (length(editor) == length(comment)) {
      editor <- editor[relevant]
    }
    warning(
      paste(unique(editor), collapse = ", "),
      " had to rewrite your query: ",
      paste(unique(comment[relevant]), collapse = "; ")
    )
  }
}
				406
Marc Kupietz	05a6079	2024-12-07 16:23:31 +0100	[diff] [blame]	407	#' @importFrom purrr map
				408	repair_data_strcuture <- function(x) {
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	409	if (is.list(x)) {
				410	as.character(purrr::map(x, ~ if (length(.x) > 1) {
Marc Kupietz	05a6079	2024-12-07 16:23:31 +0100	[diff] [blame]	411	paste(.x, collapse = " ")
				412	} else {
				413	.x
				414	}))
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	415	} else {
Marc Kupietz	05a6079	2024-12-07 16:23:31 +0100	[diff] [blame]	416	ifelse(is.na(x), "", x)
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	417	}
Marc Kupietz	05a6079	2024-12-07 16:23:31 +0100	[diff] [blame]	418	}
				419
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	420	#' Fetch the next bunch of results of a KorAP query.
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	421	#'
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	422	#' `fetchNext` fetches the next bunch of results of a KorAP query.
Marc Kupietz	3f57528	2019-10-04 14:46:04 +0200	[diff] [blame]	423	#'
Marc Kupietz	a8c40f4	2025-06-24 15:49:52 +0200	[diff] [blame]	424	#' @family corpus search functions
				425	#'
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	426	#' @param kqo object obtained from [corpusQuery()]
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	427	#' @param offset start offset for query results to fetch
				428	#' @param maxFetch maximum number of query results to fetch
Marc Kupietz	25aebc3	2019-09-16 18:40:50 +0200	[diff] [blame]	429	#' @param verbose print progress information if true
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	430	#' @param randomizePageOrder fetch result pages in pseudo random order if true. Use [set.seed()] to set seed for reproducible results.
				431	#' @return The `kqo` input object with updated slots `collectedMatches`, `apiResponse`, `nextStartIndex`, `hasMoreMatches`
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	432	#'
Marc Kupietz	05b2277	2020-02-18 21:58:42 +0100	[diff] [blame]	433	#' @examples
Marc Kupietz	6ae7605	2021-09-21 10:34:00 +0200	[diff] [blame]	434	#' \dontrun{
				435	#'
Marc Kupietz	d352642	2025-06-25 09:16:15 +0200	[diff] [blame]	436	#' q <- KorAPConnection() |>
				437	#' corpusQuery("Ameisenplage") |>
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	438	#' fetchNext()
Marc Kupietz	05b2277	2020-02-18 21:58:42 +0100	[diff] [blame]	439	#' q@collectedMatches
Marc Kupietz	657d8e7	2020-02-25 18:31:50 +0100	[diff] [blame]	440	#' }
Marc Kupietz	05b2277	2020-02-18 21:58:42 +0100	[diff] [blame]	441	#'
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	442	#' @references
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	443	#' <https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026>
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	444	#'
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	445	#' @aliases fetchNext
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	446	#' @importFrom dplyr rowwise mutate bind_rows select summarise n select
Marc Kupietz	f488112	2024-12-17 14:55:39 +0100	[diff] [blame]	447	#' @importFrom tibble enframe add_column
				448	#' @importFrom stringr word
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	449	#' @importFrom tidyr unnest unchop pivot_wider
				450	#' @importFrom purrr map
Marc Kupietz	632cbd4	2019-09-06 16:04:51 +0200	[diff] [blame]	451	#' @export
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	452	setMethod("fetchNext", "KorAPQuery", function(kqo,
				453	offset = kqo@nextStartIndex,
				454	maxFetch = maxResultsPerPage,
				455	verbose = kqo@korapConnection@verbose,
				456	randomizePageOrder = FALSE) {
Marc Kupietz	a7a8f1b	2024-12-18 15:56:19 +0100	[diff] [blame]	457	# https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	458	results <- key <- name <- tmp_positions <- 0
Marc Kupietz	a7a8f1b	2024-12-18 15:56:19 +0100	[diff] [blame]	459
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	460	if (kqo@totalResults == 0 \|\| offset >= kqo@totalResults) {
				461	return(kqo)
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	462	}
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	463	use_korap_api <- Sys.getenv("USE_KORAP_API", unset = NA)
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	464	# Calculate the initial page number (not used directly - keeping for reference)
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	465	collectedMatches <- kqo@collectedMatches
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	466
Marc Kupietz	24799fd	2025-06-25 14:15:36 +0200	[diff] [blame]	467	# Track start time for ETA calculation
				468	start_time <- Sys.time()
				469
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	470	# For randomized page order, generate a list of randomized page indices
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	471	if (randomizePageOrder) {
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	472	# Calculate how many pages we need to fetch based on maxFetch
				473	total_pages_to_fetch <- if (!is.na(maxFetch)) {
				474	# Either limited by maxFetch or total results, whichever is smaller
				475	min(ceiling(maxFetch / maxResultsPerPage), ceiling(kqo@totalResults / maxResultsPerPage))
				476	} else {
				477	# All pages
				478	ceiling(kqo@totalResults / maxResultsPerPage)
				479	}
				480
				481	# Generate randomized page indices (0-based for API)
				482	pages <- sample.int(ceiling(kqo@totalResults / maxResultsPerPage), total_pages_to_fetch) - 1
				483	page_index <- 1 # Index to track which page in the randomized list we're on
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	484	}
				485
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	486	if (is.null(collectedMatches)) {
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	487	collectedMatches <- data.frame()
				488	}
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	489
				490	# Initialize the page counter properly based on nextStartIndex and any previously fetched results
				491	# We add 1 to make it 1-based for display purposes since users expect page numbers to start from 1
				492	# For first call, this will be 1, for subsequent calls, it will reflect our actual position
				493	current_page_number <- ceiling(offset / maxResultsPerPage) + 1
				494
				495	# For sequential fetches, keep track of which global page we're on
				496	# This is important for correctly showing page numbers in subsequent fetchNext calls
				497	page_count_start <- current_page_number
				498
Marc Kupietz	5bbc9db	2019-08-30 16:30:45 +0200	[diff] [blame]	499	repeat {
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	500	# Determine which page to fetch next
				501	if (randomizePageOrder) {
				502	# In randomized mode, get the page from our randomized list using the page_index
				503	# Make sure we don't exceed the array bounds
				504	if (page_index > length(pages)) {
				505	break # No more pages to fetch in randomized mode
				506	}
				507	current_offset_page <- pages[page_index]
				508	# For display purposes in randomized mode, show which page out of the total we're fetching
				509	display_page_number <- page_index
				510	} else {
				511	# In sequential mode, use the current_page_number to calculate the offset
				512	current_offset_page <- (current_page_number - 1)
				513	display_page_number <- current_page_number
				514	}
				515
				516	# Calculate the actual offset in tokens
				517	currentOffset <- current_offset_page * maxResultsPerPage
				518
Marc Kupietz	ef0e939	2025-06-18 12:21:49 +0200	[diff] [blame]	519	# Build the query with the appropriate count and offset using httr2
				520	count_param <- min(if (!is.na(maxFetch)) maxFetch - results else maxResultsPerPage, maxResultsPerPage)
Marc Kupietz	ecc8670	2025-06-24 12:12:51 +0200	[diff] [blame]	521
Marc Kupietz	ef0e939	2025-06-18 12:21:49 +0200	[diff] [blame]	522	# Parse existing URL to preserve all query parameters
				523	parsed_url <- httr2::url_parse(kqo@requestUrl)
				524	existing_query <- parsed_url$query
Marc Kupietz	ecc8670	2025-06-24 12:12:51 +0200	[diff] [blame]	525
Marc Kupietz	ef0e939	2025-06-18 12:21:49 +0200	[diff] [blame]	526	# Add/update count and offset parameters
				527	existing_query$count <- count_param
				528	existing_query$offset <- currentOffset
Marc Kupietz	ecc8670	2025-06-24 12:12:51 +0200	[diff] [blame]	529
Marc Kupietz	ef0e939	2025-06-18 12:21:49 +0200	[diff] [blame]	530	# Rebuild the URL with all parameters
				531	query <- httr2::url_modify(kqo@requestUrl, query = existing_query)
Marc Kupietz	336c85d	2025-07-24 13:52:03 +0200	[diff] [blame^]	532
Marc Kupietz	6817095	2021-06-30 09:37:21 +0200	[diff] [blame]	533	res <- apiCall(kqo@korapConnection, query)
				534	if (length(res$matches) == 0) {
				535	break
				536	}
				537
Marc Kupietz	336c85d	2025-07-24 13:52:03 +0200	[diff] [blame^]	538	# Check for query rewrites and warn the user
				539	warnOnRewrites(res)
				540
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	541	if ("fields" %in% colnames(res$matches) && (is.na(use_korap_api) \|\| as.numeric(use_korap_api) >= 1.0)) {
Marc Kupietz	16ccf11	2025-01-26 13:25:27 +0100	[diff] [blame]	542	log_info(verbose, "Using fields API: ")
Marc Kupietz	05a6079	2024-12-07 16:23:31 +0100	[diff] [blame]	543	currentMatches <- res$matches$fields %>%
				544	purrr::map(~ mutate(.x, value = repair_data_strcuture(value))) %>%
				545	tibble::enframe() %>%
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	546	tidyr::unnest(cols = value) %>%
				547	tidyr::pivot_wider(names_from = key, id_cols = name, names_repair = "unique") %>%
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	548	dplyr::select(-name)
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	549	if ("snippet" %in% colnames(res$matches)) {
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	550	currentMatches$snippet <- res$matches$snippet
				551	}
Marc Kupietz	3cd2c6c	2025-01-08 20:35:39 +0100	[diff] [blame]	552	if ("tokens" %in% colnames(res$matches)) {
				553	currentMatches$tokens <- res$matches$tokens
				554	}
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	555	} else {
				556	currentMatches <- res$matches
				557	}
				558
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	559	for (field in kqo@fields) {
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	560	if (!field %in% colnames(currentMatches)) {
				561	currentMatches[, field] <- NA
Marc Kupietz	5bbc9db	2019-08-30 16:30:45 +0200	[diff] [blame]	562	}
				563	}
Marc Kupietz	f488112	2024-12-17 14:55:39 +0100	[diff] [blame]	564	currentMatches <- currentMatches %>%
				565	select(kqo@fields) %>%
				566	mutate(
Marc Kupietz	ff712a9	2025-07-18 09:07:23 +0200	[diff] [blame]	567	matchID = res$matches$matchID,
Marc Kupietz	0447da0	2025-01-08 20:51:09 +0100	[diff] [blame]	568	tmp_positions = gsub(".-p(\\d+)-(\\d+).", "\\1 \\2", res$matches$matchID),
Marc Kupietz	f488112	2024-12-17 14:55:39 +0100	[diff] [blame]	569	matchStart = as.integer(stringr::word(tmp_positions, 1)),
				570	matchEnd = as.integer(stringr::word(tmp_positions, 2)) - 1
				571	) %>%
				572	select(-tmp_positions)
				573
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	574	if (!is.list(collectedMatches)) {
				575	collectedMatches <- currentMatches
Marc Kupietz	5bbc9db	2019-08-30 16:30:45 +0200	[diff] [blame]	576	} else {
Marc Kupietz	2078bde	2023-08-27 16:46:15 +0200	[diff] [blame]	577	collectedMatches <- bind_rows(collectedMatches, currentMatches)
Marc Kupietz	5bbc9db	2019-08-30 16:30:45 +0200	[diff] [blame]	578	}
Marc Kupietz	ae9b617	2025-05-02 15:50:01 +0200	[diff] [blame]	579
Marc Kupietz	336c85d	2025-07-24 13:52:03 +0200	[diff] [blame^]	580
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	581	# Get the actual items per page from the API response
				582	# We now consistently use maxResultsPerPage instead
Marc Kupietz	acbaab0	2025-05-01 10:56:35 +0200	[diff] [blame]	583
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	584	# Calculate total pages consistently using fixed maxResultsPerPage
				585	# This ensures consistent page counting across the function
				586	total_pages <- ceiling(kqo@totalResults / maxResultsPerPage)
				587
Marc Kupietz	24799fd	2025-06-25 14:15:36 +0200	[diff] [blame]	588	# Calculate ETA using the centralized function from logging.R
				589	current_page <- if (randomizePageOrder) page_index else display_page_number
				590	total_pages_to_fetch <- if (!is.na(maxFetch)) {
				591	# Account for offset - we can only fetch from the remaining results after offset
				592	remaining_results_after_offset <- max(0, kqo@totalResults - offset)
				593	min(ceiling(maxFetch / maxResultsPerPage), ceiling(remaining_results_after_offset / maxResultsPerPage))
				594	} else {
				595	total_pages
				596	}
Marc Kupietz	365660e	2025-06-25 15:09:55 +0200	[diff] [blame]	597
Marc Kupietz	24799fd	2025-06-25 14:15:36 +0200	[diff] [blame]	598	eta_info <- calculate_eta(current_page, total_pages_to_fetch, start_time)
Marc Kupietz	365660e	2025-06-25 15:09:55 +0200	[diff] [blame]	599
Marc Kupietz	24799fd	2025-06-25 14:15:36 +0200	[diff] [blame]	600	# Extract timing information for display
Marc Kupietz	ae9b617	2025-05-02 15:50:01 +0200	[diff] [blame]	601	time_per_page <- NA
Marc Kupietz	ae9b617	2025-05-02 15:50:01 +0200	[diff] [blame]	602	if (!is.null(res$meta$benchmark) && is.character(res$meta$benchmark)) {
Marc Kupietz	ae9b617	2025-05-02 15:50:01 +0200	[diff] [blame]	603	time_per_page <- suppressWarnings(as.numeric(sub("s", "", res$meta$benchmark)))
Marc Kupietz	acbaab0	2025-05-01 10:56:35 +0200	[diff] [blame]	604	}
				605
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	606	# Create the page display string with proper formatting
Marc Kupietz	acbaab0	2025-05-01 10:56:35 +0200	[diff] [blame]	607
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	608	# For global page tracking, calculate the absolute page number
				609	actual_display_number <- if (randomizePageOrder) {
				610	current_offset_page + 1 # In randomized mode, this is the actual page (0-based + 1)
				611	} else {
				612	# In sequential mode, the absolute page number is the actual offset page + 1 (to make it 1-based)
				613	current_offset_page + 1
				614	}
				615
				616	# For subsequent calls to fetchNext, we need to calculate the correct page numbers
				617	# based on the current batch being fetched
				618
				619	# For each call to fetchNext, we want to show 1/2, 2/2 (not 3/4, 4/4)
				620	# Simply count from 1 within the current batch
				621
				622	# The relative page number is simply the current position in this batch
				623	if (randomizePageOrder) {
				624	relative_page_number <- page_index # In randomized mode, we start from 1 in each batch
				625	} else {
				626	relative_page_number <- display_page_number - (page_count_start - 1)
				627	}
				628
				629	# How many pages will we fetch in this batch?
Marc Kupietz	021663d	2025-06-18 17:49:22 +0200	[diff] [blame]	630	# If maxFetch is specified, calculate the total pages for this fetch operation
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	631	pages_in_this_batch <- if (!is.na(maxFetch)) {
Marc Kupietz	021663d	2025-06-18 17:49:22 +0200	[diff] [blame]	632	# Account for offset - we can only fetch from the remaining results after offset
				633	remaining_results_after_offset <- max(0, kqo@totalResults - offset)
				634	min(ceiling(maxFetch / maxResultsPerPage), ceiling(remaining_results_after_offset / maxResultsPerPage))
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	635	} else {
				636	# Otherwise fetch all remaining pages
				637	total_pages - page_count_start + 1
				638	}
				639
				640	# The total pages to be shown in this batch
				641	batch_total_pages <- pages_in_this_batch
				642
				643	page_display <- paste0(
				644	"Retrieved page ",
				645	sprintf(paste0("%", nchar(batch_total_pages), "d"), relative_page_number),
				646	"/",
				647	sprintf("%d", batch_total_pages)
				648	)
				649
				650	# If randomized, also show which actual page we fetched
				651	if (randomizePageOrder) {
				652	# Determine the maximum width needed for page numbers (based on total pages)
				653	# This ensures consistent alignment
				654	max_page_width <- nchar(as.character(total_pages))
				655	# Add the actual page number that was fetched (0-based + 1 for display) with proper padding
Marc Kupietz	7638ca4	2025-05-25 13:18:16 +0200	[diff] [blame]	656	page_display <- paste0(
				657	page_display,
				658	sprintf(" (actual page %*d)", max_page_width, current_offset_page + 1)
				659	)
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	660	}
				661	# Always show the absolute page number and total pages (for clarity)
				662	else {
				663	# Show the absolute page number (out of total possible pages)
				664	page_display <- paste0(page_display, sprintf(
				665	" (page %d of %d total)",
				666	actual_display_number, total_pages
				667	))
				668	}
				669
				670	# Add caching or timing information
				671	if (!is.null(res$meta$cached)) {
				672	page_display <- paste0(page_display, " [cached]")
				673	} else {
				674	page_display <- paste0(
				675	page_display,
				676	" in ",
				677	if (!is.na(time_per_page)) sprintf("%4.1f", time_per_page) else "?",
Marc Kupietz	24799fd	2025-06-25 14:15:36 +0200	[diff] [blame]	678	"s",
				679	eta_info
Marc Kupietz	623d712	2025-05-25 12:46:12 +0200	[diff] [blame]	680	)
				681	}
				682
				683	log_info(verbose, paste0(page_display, "\n"))
				684
				685	# Increment the appropriate counter based on mode
				686	if (randomizePageOrder) {
				687	page_index <- page_index + 1
				688	} else {
				689	current_page_number <- current_page_number + 1
				690	}
Marc Kupietz	5bbc9db	2019-08-30 16:30:45 +0200	[diff] [blame]	691	results <- results + res$meta$itemsPerPage
Marc Kupietz	e8bd49b	2024-06-28 07:24:44 +0200	[diff] [blame]	692	if (nrow(collectedMatches) >= kqo@totalResults \|\| (!is.na(maxFetch) && results >= maxFetch)) {
Marc Kupietz	5bbc9db	2019-08-30 16:30:45 +0200	[diff] [blame]	693	break
				694	}
				695	}
Marc Kupietz	6817095	2021-06-30 09:37:21 +0200	[diff] [blame]	696	nextStartIndex <- min(res$meta$startIndex + res$meta$itemsPerPage, kqo@totalResults)
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	697	KorAPQuery(
				698	nextStartIndex = nextStartIndex,
Marc Kupietz	d0d3e9b	2019-09-24 17:36:03 +0200	[diff] [blame]	699	korapConnection = kqo@korapConnection,
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	700	fields = kqo@fields,
				701	requestUrl = kqo@requestUrl,
				702	request = kqo@request,
Marc Kupietz	6817095	2021-06-30 09:37:21 +0200	[diff] [blame]	703	totalResults = kqo@totalResults,
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	704	vc = kqo@vc,
				705	webUIRequestUrl = kqo@webUIRequestUrl,
Marc Kupietz	6817095	2021-06-30 09:37:21 +0200	[diff] [blame]	706	hasMoreMatches = (kqo@totalResults > nextStartIndex),
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	707	apiResponse = res,
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	708	collectedMatches = collectedMatches
				709	)
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	710	})
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	711
				712	#' Fetch all results of a KorAP query.
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	713	#'
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	714	#' `fetchAll` fetches all results of a KorAP query.
Marc Kupietz	a6e4ee6	2021-03-05 09:00:15 +0100	[diff] [blame]	715	#'
Marc Kupietz	a8c40f4	2025-06-24 15:49:52 +0200	[diff] [blame]	716	#' @family corpus search functions
Marc Kupietz	dc880ac	2025-06-24 20:34:43 +0200	[diff] [blame]	717	#' @param kqo object obtained from [corpusQuery()]
				718	#' @param verbose print progress information if true
				719	#' @param ... further arguments passed to [fetchNext()]
				720	#' @return The updated `kqo` object with all results in `@collectedMatches`
Marc Kupietz	a8c40f4	2025-06-24 15:49:52 +0200	[diff] [blame]	721	#'
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	722	#' @examples
Marc Kupietz	6ae7605	2021-09-21 10:34:00 +0200	[diff] [blame]	723	#' \dontrun{
Marc Kupietz	ecc8670	2025-06-24 12:12:51 +0200	[diff] [blame]	724	#' # Fetch all metadata of every query hit for "Ameisenplage" and show a summary
				725	#' q <- KorAPConnection() \|>
				726	#' corpusQuery("Ameisenplage") \|>
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	727	#' fetchAll()
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	728	#' q@collectedMatches
Marc Kupietz	ecc8670	2025-06-24 12:12:51 +0200	[diff] [blame]	729	#'
				730	#' # Fetch also all KWICs
				731	#' q <- KorAPConnection() \|> auth() \|>
				732	#' corpusQuery("Ameisenplage", metadataOnly = FALSE) \|>
				733	#' fetchAll()
				734	#' q@collectedMatches
				735	#'
				736	#' # Retrieve title and text sigle metadata of all texts published on 1958-03-12
				737	#' q <- KorAPConnection() \|>
				738	#' corpusQuery("<base/s=t>", # this matches each text once
				739	#' vc = "pubDate in 1958-03-12",
				740	#' fields = c("textSigle", "title"),
				741	#' ) \|>
				742	#' fetchAll()
				743	#' q@collectedMatches
Marc Kupietz	05b2277	2020-02-18 21:58:42 +0100	[diff] [blame]	744	#' }
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	745	#'
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	746	#' @aliases fetchAll
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	747	#' @export
setMethod("fetchAll", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) {
  # fetchNext() appends each fetched page to kqo@collectedMatches and stops as
  # soon as the number of collected rows reaches kqo@totalResults. Restarting
  # at offset 0 on top of previously fetched rows would therefore duplicate the
  # first pages and stop before the last ones. Start from an empty collection
  # (NULL is the "nothing collected yet" state that fetchNext() checks for).
  kqo@collectedMatches <- NULL

  # offset = 0 restarts at the first match; maxFetch = NA lifts the limit on
  # the number of results so that paging continues until everything is fetched.
  fetchNext(kqo, offset = 0, maxFetch = NA, verbose = verbose, ...)
})
				751
				752	#' Fetches the remaining results of a KorAP query.
				753	#'
Marc Kupietz	dc880ac	2025-06-24 20:34:43 +0200	[diff] [blame]	754	#' @param kqo object obtained from [corpusQuery()]
				755	#' @param verbose print progress information if true
				756	#' @param ... further arguments passed to [fetchNext()]
				757	#' @return The updated `kqo` object with remaining results in `@collectedMatches`
				758	#'
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	759	#' @examples
Marc Kupietz	6ae7605	2021-09-21 10:34:00 +0200	[diff] [blame]	760	#' \dontrun{
				761	#'
Marc Kupietz	d352642	2025-06-25 09:16:15 +0200	[diff] [blame]	762	#' q <- KorAPConnection() \|>
				763	#' corpusQuery("Ameisenplage") \|>
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	764	#' fetchRest()
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	765	#' q@collectedMatches
Marc Kupietz	05b2277	2020-02-18 21:58:42 +0100	[diff] [blame]	766	#' }
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	767	#'
				768	#' @aliases fetchRest
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	769	#' @export
setMethod(
  "fetchRest",
  "KorAPQuery",
  function(kqo, verbose = kqo@korapConnection@verbose, ...) {
    # `offset` is deliberately not passed, so fetchNext() falls back to its own
    # default starting position; maxFetch = NA removes the limit on the number
    # of results so that all remaining pages are retrieved.
    fetchNext(kqo, maxFetch = NA, verbose = verbose, ...)
  }
)
				773
#' Parse XML annotations into linguistic layers
#'
#' Internal helper function to extract linguistic annotations (lemma, POS, morphology)
#' from XML annotation snippets returned by the KorAP API.
#'
#' Only the content of the `<span class="match">` element is inspected (up to a
#' following `<span class="context-right">`, if present); a snippet without a
#' match span is parsed as a whole. The content is split at every `</span>`.
#' Each fragment that contains a `<span>` with a `title` attribute and ends in
#' plain text yields one token. The `title` values of that fragment are split
#' on whitespace and read as `<prefix>/l:<lemma>`, `<prefix>/p:<pos>` and
#' `<prefix>/m:<morph>`; if a layer occurs more than once for a token, the
#' last occurrence wins.
#'
#' @param xml_snippet XML string containing annotation data
#' @return Named list with character vectors 'token', 'lemma', 'pos', and 'morph'
#'   of equal length; layers that are not annotated for a token are `NA`
#' @keywords internal
parse_xml_annotations <- function(xml_snippet) {
  empty_result <- list(
    token = character(0),
    lemma = character(0),
    pos = character(0),
    morph = character(0)
  )
  # The length check keeps `||` from failing on zero-length input.
  if (is.null(xml_snippet) || length(xml_snippet) == 0 ||
    is.na(xml_snippet) || xml_snippet == "") {
    return(empty_result)
  }

  # Read the lemma/pos/morph layers out of one or more title attribute values.
  read_layers <- function(titles) {
    layers <- c(
      lemma = NA_character_,
      pos = NA_character_,
      morph = NA_character_
    )
    for (annotation in unlist(strsplit(titles, "\\s+"))) {
      if (grepl("^[^/]+/l:", annotation)) {
        layers[["lemma"]] <- sub("^[^/]+/l:(.*)$", "\\1", annotation)
      } else if (grepl("^[^/]+/p:", annotation)) {
        layers[["pos"]] <- sub("^[^/]+/p:(.*)$", "\\1", annotation)
      } else if (grepl("^[^/]+/m:", annotation)) {
        layers[["morph"]] <- sub("^[^/]+/m:(.*)$", "\\1", annotation)
      }
    }
    layers
  }

  # Narrow the snippet down to the content of the match span. Literal
  # (fixed = TRUE) searches are used so that neither regex metacharacters nor
  # line breaks inside the snippet can influence where the content is cut.
  content_to_parse <- xml_snippet
  match_open <- regexpr('<span class="match">', xml_snippet, fixed = TRUE)
  if (as.integer(match_open) > 0) {
    content_start <- as.integer(match_open) + attr(match_open, "match.length")
    remaining <- substr(xml_snippet, content_start, nchar(xml_snippet))
    right_open <- regexpr('<span class="context-right">', remaining, fixed = TRUE)
    content_to_parse <- if (as.integer(right_open) > 0) {
      # Everything before the right context belongs to the match span
      substr(remaining, 1, as.integer(right_open) - 1)
    } else {
      # No right context: drop the final </span>, which closes the match span
      sub("</span>\\s*$", "", remaining)
    }
  }

  # One record (token + layers) per recognised token, in document order
  records <- list()
  title_pattern <- 'title="([^"]*)"'

  # Primary approach: split at closing spans; a fragment that contains a titled
  # span and ends in plain text is a token with all titles opened before it.
  parts <- trimws(unlist(strsplit(content_to_parse, "</span>", fixed = TRUE)))
  for (part in parts) {
    if (!nzchar(part) || !grepl("<span[^>]*title=", part)) next

    # Token text is whatever follows the last '>' of the fragment
    text_content <- trimws(sub(".*>([^<]*)$", "\\1", part))
    if (!nzchar(text_content) || startsWith(text_content, "<")) next

    titles <- regmatches(part, gregexpr(title_pattern, part))[[1]]
    records[[length(records) + 1L]] <- c(
      token = text_content,
      read_layers(sub(title_pattern, "\\1", titles))
    )
  }

  # Fallback: if splitting found nothing, scan for innermost titled spans that
  # directly contain text.
  if (length(records) == 0) {
    innermost_pattern <- '<span[^>]*title="([^"]*)"[^>]*>([^<]+)</span>'
    hits <- regmatches(
      content_to_parse,
      gregexpr(innermost_pattern, content_to_parse, perl = TRUE)
    )[[1]]
    for (hit in hits) {
      text <- trimws(sub(innermost_pattern, "\\2", hit, perl = TRUE))
      if (!nzchar(text)) next
      records[[length(records) + 1L]] <- c(
        token = text,
        read_layers(sub(innermost_pattern, "\\1", hit, perl = TRUE))
      )
    }
  }

  # Transpose the records into one character vector per layer. Every record
  # carries all four fields, so the vectors always have equal length.
  layer_column <- function(field) {
    vapply(records, function(record) record[[field]], character(1), USE.NAMES = FALSE)
  }

  list(
    token = layer_column("token"),
    lemma = layer_column("lemma"),
    pos = layer_column("pos"),
    morph = layer_column("morph")
  )
}
				925
				926	#'
				927	#' Parse XML annotations into linguistic layers with left/match/right structure
				928	#'
				929	#' Internal helper function to extract linguistic annotations (lemma, POS, morphology)
				930	#' from XML annotation snippets returned by the KorAP API, split into left context,
				931	#' match, and right context sections like the tokens field.
				932	#'
				933	#' @param xml_snippet XML string containing annotation data
				934	#' @return Named list with nested structure containing left/match/right for 'atokens', 'lemma', 'pos', and 'morph'
				935	#' @keywords internal
				936	parse_xml_annotations_structured <- function(xml_snippet) {
				937	if (is.null(xml_snippet) \|\| is.na(xml_snippet) \|\| xml_snippet == "") {
				938	empty_result <- list(left = character(0), match = character(0), right = character(0))
				939	return(list(
				940	atokens = empty_result,
				941	lemma = empty_result,
				942	pos = empty_result,
				943	morph = empty_result
				944	))
				945	}
				946
				947	# Helper function to extract annotations from a span section
				948	extract_annotations_from_section <- function(section_content) {
Marc Kupietz	560b591	2025-09-01 17:36:13 +0200	[diff] [blame]	949	# Remove any <mark>...</mark> tags that may interrupt token boundaries
				950	section_no_marks <- gsub('</?mark[^>]*>', '', section_content, perl = TRUE)
				951	# Normalize separators between adjacent top-level spans so splitting is robust.
				952	# Replace any punctuation/entity/space run between one-or-more closing spans and the next opening span
				953	# with a single space, preserving all closing spans.
				954	section_norm <- gsub('((?:</span>)+)[[:space:]](?:&[^;]+;\|[[:punct:]]\|[[:space:]])[[:space:]]*(<span)', '\\1 \\2', section_no_marks, perl = TRUE)
				955	# Handle both spaced tokens and nested single tokens by scanning innermost spans with direct text
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	956	tokens <- character(0)
				957	lemmas <- character(0)
				958	pos_tags <- character(0)
				959	morph_tags <- character(0)
Marc Kupietz	89f796e	2025-07-19 09:05:06 +0200	[diff] [blame]	960
Marc Kupietz	560b591	2025-09-01 17:36:13 +0200	[diff] [blame]	961	pat_token <- '<span[^>]title="([^"])"[^>]*>([^<]+)</span>'
				962	mm <- gregexpr(pat_token, section_norm, perl = TRUE)
				963	if (mm[[1]][1] != -1) {
				964	starts <- mm[[1]]
				965	lens <- attr(mm[[1]], 'match.length')
				966	for (k in seq_along(starts)) {
				967	s <- starts[k]
				968	e <- s + lens[k] - 1
				969	fragment <- substr(section_norm, s, e)
				970	text_content <- sub(pat_token, '\\2', fragment, perl = TRUE)
				971	text_content <- trimws(text_content)
				972	title_content <- sub(pat_token, '\\1', fragment, perl = TRUE)
Marc Kupietz	89f796e	2025-07-19 09:05:06 +0200	[diff] [blame]	973
Marc Kupietz	560b591	2025-09-01 17:36:13 +0200	[diff] [blame]	974	if (nchar(text_content) == 0) next
Marc Kupietz	89f796e	2025-07-19 09:05:06 +0200	[diff] [blame]	975
Marc Kupietz	560b591	2025-09-01 17:36:13 +0200	[diff] [blame]	976	lemma <- NA
				977	pos_tag <- NA
				978	morph_features <- character(0)
Marc Kupietz	89f796e	2025-07-19 09:05:06 +0200	[diff] [blame]	979
Marc Kupietz	560b591	2025-09-01 17:36:13 +0200	[diff] [blame]	980	# parse inner title
				981	ann <- unlist(strsplit(title_content, "[[:space:]]+"))
				982	for (a in ann) {
				983	if (grepl('/l:', a)) {
				984	lemma <- sub('.?/l:(.)$', '\\1', a, perl = TRUE)
				985	} else if (grepl('/p:', a)) {
				986	pos_tag <- sub('.?/p:(.)$', '\\1', a, perl = TRUE)
				987	} else if (grepl('/m:', a)) {
				988	morph_features <- c(morph_features, sub('.?/m:(.)$', '\\1', a, perl = TRUE))
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	989	}
				990	}
Marc Kupietz	560b591	2025-09-01 17:36:13 +0200	[diff] [blame]	991
				992	# If lemma missing, look back in nearby context for the nearest title containing l:
				993	if (is.na(lemma) \|\| nchar(lemma) == 0) {
				994	ctx_start <- max(1, s - 500)
				995	context <- substr(section_norm, ctx_start, s - 1)
				996	tmm <- gregexpr('title="([^"]*)"', context, perl = TRUE)
				997	if (tmm[[1]][1] != -1) {
				998	ctx_titles <- regmatches(context, tmm)[[1]]
				999	for (ti in rev(ctx_titles)) {
				1000	cont <- sub('title="([^"]*)"', '\\1', ti, perl = TRUE)
				1001	if (grepl('/l:', cont)) {
				1002	lemma <- sub('.?/l:([^ ]+).', '\\1', cont, perl = TRUE)
				1003	break
				1004	}
				1005	}
				1006	}
				1007	}
				1008
				1009	# If POS missing, keep NA; morphological features may also appear in outer titles
				1010	if (length(morph_features) == 0) {
				1011	ctx_start <- max(1, s - 500)
				1012	context <- substr(section_norm, ctx_start, s - 1)
				1013	tmm <- gregexpr('title="([^"]*)"', context, perl = TRUE)
				1014	if (tmm[[1]][1] != -1) {
				1015	ctx_titles <- regmatches(context, tmm)[[1]]
				1016	for (ti in rev(ctx_titles)) {
				1017	cont <- sub('title="([^"]*)"', '\\1', ti, perl = TRUE)
				1018	if (grepl('/m:', cont)) {
				1019	mparts <- unlist(strsplit(cont, "[[:space:]]+"))
				1020	for (mp in mparts) if (grepl('/m:', mp)) morph_features <- c(morph_features, sub('.?/m:(.)$', '\\1', mp, perl = TRUE))
				1021	break
				1022	}
				1023	}
				1024	}
				1025	}
				1026
				1027	tokens <- c(tokens, text_content)
				1028	lemmas <- c(lemmas, if (!is.null(lemma)) lemma else NA)
				1029	pos_tags <- c(pos_tags, if (!is.null(pos_tag)) pos_tag else NA)
				1030	morph_tags <- c(morph_tags, if (length(morph_features) > 0) paste(morph_features, collapse = "\|") else NA)
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1031	}
				1032	}
				1033
				1034	# Ensure all vectors have the same length
				1035	max_length <- max(length(tokens), length(lemmas), length(pos_tags), length(morph_tags))
				1036	if (max_length > 0) {
				1037	tokens <- c(tokens, rep(NA, max_length - length(tokens)))
				1038	lemmas <- c(lemmas, rep(NA, max_length - length(lemmas)))
				1039	pos_tags <- c(pos_tags, rep(NA, max_length - length(pos_tags)))
				1040	morph_tags <- c(morph_tags, rep(NA, max_length - length(morph_tags)))
				1041	}
				1042
				1043	return(list(
				1044	tokens = tokens,
				1045	lemmas = lemmas,
				1046	pos_tags = pos_tags,
				1047	morph_tags = morph_tags
				1048	))
				1049	}
				1050
				1051	# Split the XML into three parts: left context, match content, and right context
				1052	# The structure is: <span class="match">...left...<mark>...match...</mark>...right...</span>
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	1053
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1054	# First extract the content within the match span using DOTALL modifier
				1055	match_span_pattern <- '(?s)<span class="match">(.?)</span>\\s<span class="context-right">'
				1056	match_span_match <- regexpr(match_span_pattern, xml_snippet, perl = TRUE)
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	1057
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1058	if (match_span_match == -1) {
				1059	# Try alternative pattern if no context-right
				1060	match_span_pattern <- '(?s)<span class="match">(.?)</span>\\s$'
				1061	match_span_match <- regexpr(match_span_pattern, xml_snippet, perl = TRUE)
				1062	}
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	1063
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1064	if (match_span_match > 0) {
				1065	match_span_content <- gsub(match_span_pattern, '\\1', xml_snippet, perl = TRUE)
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	1066
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1067	# Now find the <mark> and </mark> positions within this content
				1068	mark_start <- regexpr('<mark[^>]*>', match_span_content, perl = TRUE)
Marc Kupietz	560b591	2025-09-01 17:36:13 +0200	[diff] [blame]	1069	# Use the LAST closing </mark> to cover multi-part matches
				1070	mark_end_gre <- gregexpr('</mark>', match_span_content, perl = TRUE)
				1071	mark_end_positions <- mark_end_gre[[1]]
				1072	mark_end <- if (!is.null(mark_end_positions) && length(mark_end_positions) > 0 && mark_end_positions[1] != -1)
				1073	mark_end_positions[length(mark_end_positions)] else -1
				1074	mark_end_len <- if (mark_end != -1) attr(mark_end_gre[[1]], "match.length")[length(mark_end_positions)] else 0
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	1075
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1076	if (mark_start > 0 && mark_end > 0) {
Marc Kupietz	560b591	2025-09-01 17:36:13 +0200	[diff] [blame]	1077	# Left context: everything before first <mark>
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1078	left_content <- substr(match_span_content, 1, mark_start - 1)
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	1079
Marc Kupietz	560b591	2025-09-01 17:36:13 +0200	[diff] [blame]	1080	# Match content: everything between first <mark> and last </mark>
				1081	match_content <- substr(match_span_content, mark_start, mark_end + mark_end_len - 1)
Marc Kupietz	c643a12	2025-07-18 18:18:36 +0200	[diff] [blame]	1082
Marc Kupietz	560b591	2025-09-01 17:36:13 +0200	[diff] [blame]	1083	# Right context: everything after last </mark>
				1084	right_content_start <- mark_end + mark_end_len
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1085	right_content <- substr(match_span_content, right_content_start, nchar(match_span_content))
				1086	} else {
				1087	# No mark tags found, treat entire match span as match content
				1088	left_content <- ""
				1089	match_content <- match_span_content
				1090	right_content <- ""
				1091	}
				1092	} else {
				1093	# No match span found, treat entire content as match
				1094	left_content <- ""
				1095	match_content <- xml_snippet
				1096	right_content <- ""
				1097	}
				1098
				1099	# Process each section
				1100	left_annotations <- extract_annotations_from_section(left_content)
				1101	match_annotations <- extract_annotations_from_section(match_content)
				1102	right_annotations <- extract_annotations_from_section(right_content)
				1103
				1104	return(list(
				1105	atokens = list(
				1106	left = left_annotations$tokens,
				1107	match = match_annotations$tokens,
				1108	right = right_annotations$tokens
				1109	),
				1110	lemma = list(
				1111	left = left_annotations$lemmas,
				1112	match = match_annotations$lemmas,
				1113	right = right_annotations$lemmas
				1114	),
				1115	pos = list(
				1116	left = left_annotations$pos_tags,
				1117	match = match_annotations$pos_tags,
				1118	right = right_annotations$pos_tags
				1119	),
				1120	morph = list(
				1121	left = left_annotations$morph_tags,
				1122	match = match_annotations$morph_tags,
				1123	right = right_annotations$morph_tags
				1124	)
				1125	))
				1126	}
				1127
#' Fetch annotations for all collected matches
#'
#' `r lifecycle::badge("experimental")`
#'
#' `fetchAnnotations` fetches annotations (only token annotations, for now)
#' for all matches in the `@collectedMatches` slot
#' of a KorAPQuery object and adds annotation columns directly to the `@collectedMatches`
#' data frame. The method uses the `matchID` from collected matches and falls
#' back to `textSigle`, `matchStart` and `matchEnd` if no usable `matchID`
#' is available.
#'
#' Important: For copyright-restricted corpora, users must be authorized via [auth()]
#' and the initial corpus query must have `metadataOnly = FALSE` to ensure snippets are
#' available for annotation parsing.
#'
#' The method parses XML snippet annotations and adds linguistic columns to the data frame:
#' - `pos`: data frame with `left`, `match`, `right` columns, each containing list vectors of part-of-speech tags
#' - `lemma`: data frame with `left`, `match`, `right` columns, each containing list vectors of lemmas
#' - `morph`: data frame with `left`, `match`, `right` columns, each containing list vectors of morphological tags
#' - `atokens`: data frame with `left`, `match`, `right` columns, each containing list vectors of token text (from annotations)
#' - `annotation_snippet`: original XML snippet from the annotation API
#'
#' Rows whose annotation request fails are filled with `NA`; rows for which
#' the API response contains no snippet are filled with empty character
#' vectors. In both cases only columns that are newly created or explicitly
#' overwritten are touched.
#'
#' @family corpus search functions
#' @concept Annotations
#'
#' @param kqo object obtained from [corpusQuery()] with collected matches. Note: the original corpus query should have `metadataOnly = FALSE` for annotation parsing to work.
#' @param foundry string specifying the foundry to use for annotations (default: "tt" for Tree-Tagger)
#' @param overwrite logical; if TRUE, re-fetch and replace any existing
#'   annotation columns. If FALSE (default), only add missing annotation layers
#'   and preserve already fetched ones (e.g., keep POS/lemma from a previous
#'   foundry while adding morph from another).
#' @param verbose print progress information if true
#' @return The updated `kqo` object with annotation columns
#' like `pos`, `lemma`, `morph` (and `atokens` and `annotation_snippet`)
#' in the `@collectedMatches` slot. Each column is a data frame
#' with `left`, `match`, and `right` columns containing list vectors of annotations
#' for the left context, matched tokens, and right context, respectively.
#' The original XML snippet for each match is also stored in `annotation_snippet`.
#'
#' @examples
#' \dontrun{
#'
#' # Fetch annotations for matches using Tree-Tagger foundry
#' # Note: Authorization required for copyright-restricted corpora
#' q <- KorAPConnection() |>
#'   auth() |>
#'   corpusQuery("Ameisenplage", metadataOnly = FALSE) |>
#'   fetchNext(maxFetch = 10) |>
#'   fetchAnnotations()
#'
#' # Access linguistic annotations:
#' pos_tags <- q@collectedMatches$pos
#' # Data frame with left/match/right columns for POS tags
#' lemmas <- q@collectedMatches$lemma
#' # Data frame with left/match/right columns for lemmas
#' morphology <- q@collectedMatches$morph
#' # Data frame with left/match/right columns for morphological tags
#' atokens <- q@collectedMatches$atokens
#' # Data frame with left/match/right columns for annotation token text
#'
#' # Original XML snippet for match i
#' i <- 1
#' raw_snippet <- q@collectedMatches$annotation_snippet[[i]]
#'
#' # Access specific components:
#' # POS tags for the matched tokens in match i
#' match_pos <- q@collectedMatches$pos$match[[i]]
#' # Lemmas for the left context in match i
#' left_lemmas <- q@collectedMatches$lemma$left[[i]]
#' # Token text for the right context in match i
#' right_tokens <- q@collectedMatches$atokens$right[[i]]
#'
#' # Use a different foundry (e.g., MarMoT)
#' q <- KorAPConnection() |>
#'   auth() |>
#'   corpusQuery("Ameisenplage", metadataOnly = FALSE) |>
#'   fetchNext(maxFetch = 10) |>
#'   fetchAnnotations(foundry = "marmot")
#' q@collectedMatches$pos$left[1] # POS tags for the left context of the first match
#' }
#' @export
setMethod("fetchAnnotations", "KorAPQuery", function(kqo,
                                                     foundry = "tt",
                                                     overwrite = FALSE,
                                                     verbose = kqo@korapConnection@verbose) {
  if (is.null(kqo@collectedMatches) ||
    nrow(kqo@collectedMatches) == 0) {
    warning("No collected matches found. Please run fetchNext() or fetchAll() first.")
    return(kqo)
  }

  df <- kqo@collectedMatches
  kco <- kqo@korapConnection
  nrows <- nrow(df)

  annotation_types <- c("pos", "lemma", "morph", "atokens")
  sections <- c("left", "match", "right")

  # Remember which annotation columns already existed; together with
  # `overwrite` this decides which cells may be (re)written below.
  tracked_columns <- c(annotation_types, "annotation_snippet")
  existing_types <- as.list(tracked_columns %in% colnames(df))
  names(existing_types) <- tracked_columns

  # Create missing (or to-be-overwritten) annotation columns. Each one is a
  # nested data frame with left/match/right list columns (like the tokens field).
  empty_char_list <- I(replicate(nrows, character(0), simplify = FALSE))
  for (type in annotation_types) {
    if (overwrite || !existing_types[[type]]) {
      df[[type]] <- data.frame(
        left = empty_char_list,
        match = empty_char_list,
        right = empty_char_list,
        stringsAsFactors = FALSE
      )
    }
  }
  if (overwrite || !existing_types$annotation_snippet) {
    df$annotation_snippet <- rep(NA_character_, nrows)
  }

  # TRUE if a single annotation cell holds nothing useful.
  is_blank <- function(value) {
    is.null(value) || length(value) == 0 || all(is.na(value))
  }

  # TRUE if row `row_index` of a nested annotation data frame is effectively
  # empty in all three sections (or does not exist at all).
  is_empty_annotation_row <- function(ann_df, row_index) {
    if (is.null(ann_df) || nrow(ann_df) < row_index) {
      return(TRUE)
    }
    all(vapply(
      sections,
      function(section) is_blank(ann_df[[section]][[row_index]]),
      logical(1)
    ))
  }

  # Build the annotation request URL for match `i`.
  build_annotation_url <- function(i) {
    match_path <- NULL
    if ("matchID" %in% colnames(df) && !is.na(df$matchID[i])) {
      # matchID format: "match-match-A00/JUN/39609-p202-203" or encrypted format like
      # "match-DNB10/CSL/80400-p2343-2344x_MinDOhu_P6dd2MMZJyyus_7MairdKnr1LxY07Cya-Ow"
      if (regexpr("match-(.+?-p\\d+-\\d+.*)", df$matchID[i], perl = TRUE) > 0) {
        # Everything after the leading "match-", including an encrypted suffix
        doc_path_with_pos <- gsub("^match-(.+)$", "\\1", df$matchID[i], perl = TRUE)
        # Turn the dash before the position into a slash, keep the rest
        match_path <- gsub("-p(\\d+-\\d+.*)", "/p\\1", doc_path_with_pos)
      }
    }
    if (is.null(match_path)) {
      # Fallback: construct the path from textSigle and the match offsets.
      # Format numbers to avoid scientific notation.
      match_start <- format(df$matchStart[i], scientific = FALSE)
      match_end <- format(df$matchEnd[i], scientific = FALSE)
      match_path <- paste0(df$textSigle[i], "/", "p", match_start, "-", match_end)
    }
    httr2::url_modify(
      paste0(kco@apiUrl, "corpus/", match_path),
      query = list(foundry = foundry)
    )
  }

  # Write `value` into all sections of row `i` for every annotation type that
  # is new or explicitly overwritten; returns the modified data frame.
  fill_row <- function(df, i, value) {
    for (type in annotation_types) {
      if (overwrite || !existing_types[[type]]) {
        for (section in sections) {
          df[[type]][[section]][i] <- list(value)
        }
      }
    }
    df
  }

  # Mark row `i` as failed (NA annotations and NA snippet).
  mark_row_failed <- function(df, i) {
    df <- fill_row(df, i, NA)
    if (overwrite || !existing_types$annotation_snippet) {
      df$annotation_snippet[[i]] <- NA
    }
    df
  }

  # Store parsed annotations for row `i`. Pre-existing layers are only
  # filled if their row is still empty (unless `overwrite` is set).
  store_parsed <- function(df, i, parsed) {
    for (type in annotation_types) {
      if (overwrite || !existing_types[[type]] ||
        is_empty_annotation_row(df[[type]], i)) {
        for (section in sections) {
          df[[type]][[section]][i] <- list(parsed[[type]][[section]])
        }
      }
    }
    df
  }

  # Merge one API response into row `i` and return the modified data frame.
  annotate_row <- function(df, i, res) {
    if (is.null(res)) {
      return(mark_row_failed(df, i))
    }
    has_snippet <- is.list(res) && "snippet" %in% names(res)

    # Store the raw annotation snippet (respect overwrite flag)
    if (overwrite || !existing_types$annotation_snippet ||
      is.null(df$annotation_snippet[[i]]) || is.na(df$annotation_snippet[[i]])) {
      df$annotation_snippet[[i]] <- if (has_snippet) res$snippet else NA
    }

    if (!has_snippet) {
      # No snippet available, store empty vectors
      return(fill_row(df, i, character(0)))
    }

    parsed <- parse_xml_annotations_structured(res$snippet)
    tryCatch(
      store_parsed(df, i, parsed),
      # Fall back to empty vectors if the parsed data cannot be stored
      error = function(assign_error) fill_row(df, i, character(0))
    )
  }

  # Initialize timing for ETA calculation
  start_time <- Sys.time()
  if (verbose) {
    log_info(verbose, paste("Starting to fetch annotations for", nrows, "matches\n"))
  }

  for (i in seq_len(nrows)) {
    if (verbose && i > 1) {
      eta_info <- calculate_eta(i, nrows, start_time)
      log_info(verbose, paste("Fetching annotations for match", i, "of", nrows, eta_info, "\n"))
    }

    req <- build_annotation_url(i)

    # The handlers return the updated data frame instead of assigning to `df`
    # from inside the handler function: a plain `<-` there would only modify
    # a handler-local copy and the failure markers would be lost.
    df <- tryCatch(
      annotate_row(df, i, apiCall(kco, req)),
      error = function(e) {
        log_info(verbose, paste0(
          "Failed to fetch annotations for match ", i, ": ",
          conditionMessage(e), "\n"
        ))
        mark_row_failed(df, i)
      }
    )
  }

  # Update the collectedMatches with annotation data
  kqo@collectedMatches <- df

  if (verbose) {
    elapsed_time <- Sys.time() - start_time
    log_info(verbose, paste("Finished fetching annotations for", nrows, "matches in", format_duration(as.numeric(elapsed_time, units = "secs")), "\n"))
  }

  kqo
})
				1500
Marc Kupietz	ad8d2ed	2025-04-05 15:37:38 +0200	[diff] [blame]	1501	#' Query frequencies of search expressions in virtual corpora
Marc Kupietz	3f57528	2019-10-04 14:46:04 +0200	[diff] [blame]	1502	#'
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	1503	#' `frequencyQuery` combines [corpusQuery()], [corpusStats()] and
Marc Kupietz	ad8d2ed	2025-04-05 15:37:38 +0200	[diff] [blame]	1504	#' [ci()] to compute a tibble with the absolute and relative frequencies and
Marc Kupietz	3f57528	2019-10-04 14:46:04 +0200	[diff] [blame]	1505	#' confidence intervals of one ore multiple search terms across one or multiple
				1506	#' virtual corpora.
				1507	#'
Marc Kupietz	a8c40f4	2025-06-24 15:49:52 +0200	[diff] [blame]	1508	#' @family frequency analysis
Marc Kupietz	3f57528	2019-10-04 14:46:04 +0200	[diff] [blame]	1509	#' @aliases frequencyQuery
Marc Kupietz	3f57528	2019-10-04 14:46:04 +0200	[diff] [blame]	1510	#' @examples
Marc Kupietz	6ae7605	2021-09-21 10:34:00 +0200	[diff] [blame]	1511	#' \dontrun{
				1512	#'
Marc Kupietz	ad8d2ed	2025-04-05 15:37:38 +0200	[diff] [blame]	1513	#' KorAPConnection(verbose = TRUE) \|>
Marc Kupietz	3f57528	2019-10-04 14:46:04 +0200	[diff] [blame]	1514	#' frequencyQuery(c("Mücke", "Schnake"), paste0("pubDate in ", 2000:2003))
Marc Kupietz	05b2277	2020-02-18 21:58:42 +0100	[diff] [blame]	1515	#' }
Marc Kupietz	3f57528	2019-10-04 14:46:04 +0200	[diff] [blame]	1516	#'
Marc Kupietz	ad8d2ed	2025-04-05 15:37:38 +0200	[diff] [blame]	1517	# @inheritParams corpusQuery
Marc Kupietz	617266d	2025-02-27 10:43:07 +0100	[diff] [blame]	1518	#' @param kco [KorAPConnection()] object (obtained e.g. from `KorAPConnection()`
Marc Kupietz	ad8d2ed	2025-04-05 15:37:38 +0200	[diff] [blame]	1519	#' @param query corpus query string(s.) (can be a vector). The query language depends on the `ql` parameter. Either `query` must be provided or `KorAPUrl`.
				1520	#' @param vc virtual corpus definition(s) (can be a vector)
Marc Kupietz	67edcb5	2021-09-20 21:54:24 +0200	[diff] [blame]	1521	#' @param conf.level confidence level of the returned confidence interval (passed through [ci()] to [prop.test()]).
				1522	#' @param as.alternatives LOGICAL that specifies if the query terms should be treated as alternatives. If `as.alternatives` is TRUE, the sum over all query hits, instead of the respective vc token sizes is used as total for the calculation of relative frequencies.
Marc Kupietz	ad8d2ed	2025-04-05 15:37:38 +0200	[diff] [blame]	1523	#' @param ... further arguments passed to or from other methods (see [corpusQuery()]), most notably `expand`, a logical that decides if `query` and `vc` parameters are expanded to all of their combinations. It defaults to `TRUE`, if `query` and `vc` have different lengths, and to `FALSE` otherwise.
Marc Kupietz	3f57528	2019-10-04 14:46:04 +0200	[diff] [blame]	1524	#' @export
Marc Kupietz	ad8d2ed	2025-04-05 15:37:38 +0200	[diff] [blame]	1525	#'
				1526	#' @return A tibble, with each row containing the following result columns for query and vc combinations:
				1527	#' - query: the query string used for the frequency analysis.
				1528	#' - totalResults: absolute frequency of query matches in the vc.
				1529	#' - vc: virtual corpus used for the query.
				1530	#' - webUIRequestUrl: URL of the corresponding web UI request with respect to query and vc.
				1531	#' - total: total number of words in vc.
				1532	#' - f: relative frequency of query matches in the vc.
				1533	#' - conf.low: lower bound of the confidence interval for the relative frequency, given `conf.level`.
				1534	#' - conf.high: upper bound of the confidence interval for the relative frequency, given `conf.level`.
				1535
setMethod(
  "frequencyQuery", "KorAPConnection",
  function(kco, query, vc = "", conf.level = 0.95, as.alternatives = FALSE, ...) {
    # Both branches run the same metadata-only query; they differ only in how
    # the denominator column `total` is derived.
    with_totals <- if (as.alternatives) {
      hits <- corpusQuery(kco, query, vc, metadataOnly = TRUE, as.df = TRUE, ...)
      # Query terms are alternatives: the total is the sum of all hits that
      # share a vc. The result stays grouped by vc when it is handed to ci().
      mutate(group_by(hits, vc), total = sum(totalResults))
    } else {
      hits <- corpusQuery(kco, query, vc, metadataOnly = TRUE, as.df = TRUE, ...)
      # The total is the token count reported by corpusStats(). `vc` is
      # evaluated inside mutate(), so dplyr's data masking resolves it to a
      # `vc` column of the query result when one exists.
      mutate(hits, total = corpusStats(kco, vc = vc, as.df = TRUE)$tokens)
    }

    # Add relative frequencies and confidence bounds.
    ci(with_totals, conf.level = conf.level)
  }
)
Marc Kupietz	3f57528	2019-10-04 14:46:04 +0200	[diff] [blame]	1550
Marc Kupietz	38a9d68	2024-12-06 16:17:09 +0100	[diff] [blame]	1551	#' buildWebUIRequestUrlFromString
				1552	#'
				1553	#' @rdname KorAPQuery-class
				1554	#' @importFrom urltools url_encode
				1555	#' @export
				1556	buildWebUIRequestUrlFromString <- function(KorAPUrl,
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	1557	query,
				1558	vc = "",
				1559	ql = "poliqarp") {
Marc Kupietz	38a9d68	2024-12-06 16:17:09 +0100	[diff] [blame]	1560	if ("KorAPConnection" %in% class(KorAPUrl)) {
				1561	KorAPUrl <- KorAPUrl@KorAPUrl
				1562	}
				1563
				1564	request <-
				1565	paste0(
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	1566	"?q=",
Marc Kupietz	38a9d68	2024-12-06 16:17:09 +0100	[diff] [blame]	1567	urltools::url_encode(enc2utf8(as.character(query))),
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	1568	ifelse(vc != "",
				1569	paste0("&cq=", urltools::url_encode(enc2utf8(vc))),
				1570	""
				1571	),
				1572	"&ql=",
Marc Kupietz	38a9d68	2024-12-06 16:17:09 +0100	[diff] [blame]	1573	ql
				1574	)
				1575	paste0(KorAPUrl, request)
				1576	}
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1577
				1578	#' buildWebUIRequestUrl
				1579	#'
				1580	#' @rdname KorAPQuery-class
Marc Kupietz	f912959	2025-01-26 19:17:54 +0100	[diff] [blame]	1581	#' @importFrom httr2 url_parse
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1582	#' @export
				1583	buildWebUIRequestUrl <- function(kco,
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	1584	query = if (missing(KorAPUrl)) {
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1585	stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	1586	} else {
				1587	httr2::url_parse(KorAPUrl)$query$q
				1588	},
Marc Kupietz	f912959	2025-01-26 19:17:54 +0100	[diff] [blame]	1589	vc = if (missing(KorAPUrl)) "" else httr2::url_parse(KorAPUrl)$query$cq,
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1590	KorAPUrl,
Marc Kupietz	f912959	2025-01-26 19:17:54 +0100	[diff] [blame]	1591	ql = if (missing(KorAPUrl)) "poliqarp" else httr2::url_parse(KorAPUrl)$query$ql) {
Marc Kupietz	38a9d68	2024-12-06 16:17:09 +0100	[diff] [blame]	1592	buildWebUIRequestUrlFromString(kco@KorAPUrl, query, vc, ql)
Marc Kupietz	dbd431a	2021-08-29 12:17:45 +0200	[diff] [blame]	1593	}
				1594
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	1595	#' format()
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	1596	#' @rdname KorAPQuery-class
				1597	#' @param x KorAPQuery object
				1598	#' @param ... further arguments passed to or from other methods
Marc Kupietz	b73ca0f	2025-01-28 20:45:01 +0100	[diff] [blame]	1599	#' @importFrom urltools param_get url_decode
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	1600	#' @export
				1601	format.KorAPQuery <- function(x, ...) {
				1602	cat("<KorAPQuery>\n")
				1603	q <- x
Marc Kupietz	d885122	2025-05-01 10:57:19 +0200	[diff] [blame]	1604	param <- urltools::param_get(q@request) \|> lapply(urltools::url_decode)
Marc Kupietz	b73ca0f	2025-01-28 20:45:01 +0100	[diff] [blame]	1605	cat(" Query: ", param$q, "\n")
				1606	if (!is.null(param$cq) && param$cq != "") {
				1607	cat(" Virtual corpus: ", param$cq, "\n")
				1608	}
				1609	if (!is.null(q@collectedMatches)) {
				1610	cat("==============================================================================================================", "\n")
				1611	print(summary(q@collectedMatches))
				1612	cat("==============================================================================================================", "\n")
				1613	}
				1614	cat(" Total results: ", q@totalResults, "\n")
				1615	cat(" Fetched results: ", q@nextStartIndex, "\n")
Marc Kupietz	a29f3d4	2025-07-18 10:14:43 +0200	[diff] [blame]	1616	if (!is.null(q@collectedMatches) && "pos" %in% colnames(q@collectedMatches)) {
				1617	successful_annotations <- sum(!is.na(q@collectedMatches$annotation_snippet))
				1618	parsed_annotations <- sum(!is.na(q@collectedMatches$pos))
				1619	cat(" Annotations: ", successful_annotations, " of ", nrow(q@collectedMatches), " matches")
				1620	if (parsed_annotations > 0) {
				1621	cat(" (", parsed_annotations, " with parsed linguistic data)")
				1622	}
				1623	cat("\n")
Marc Kupietz	e52b295	2025-07-17 16:53:02 +0200	[diff] [blame]	1624	}
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	1625	}
				1626
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	1627	#' show()
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	1628	#'
Marc Kupietz	e95108e	2019-09-18 13:23:58 +0200	[diff] [blame]	1629	#' @rdname KorAPQuery-class
				1630	#' @param object KorAPQuery object
Marc Kupietz	62da2b5	2019-09-12 17:43:34 +0200	[diff] [blame]	1631	#' @export
setMethod(
  f = "show",
  signature = "KorAPQuery",
  definition = function(object) {
    # Printing is delegated to the format() method, which writes directly to
    # the console; the object itself is handed back invisibly.
    format(object)
    invisible(object)
  }
)