Marc Kupietz | 5bbc9db | 2019-08-30 16:30:45 +0200 | [diff] [blame] | 1 | #' @import jsonlite |
| 2 | #' @import curl |
| 3 | |
| 4 | library(jsonlite) |
| 5 | |
# Metadata fields used when a caller does not supply its own `fields`.
defaultFields <- c(
  "corpusSigle",
  "textSigle",
  "pubDate",
  "pubPlace",
  "availability",
  "textClass"
)
| 8 | |
#' Fetch statistics for a corpus query
#'
#' Requests `statistics?cq=<vc>` below the API base URL and returns the parsed
#' JSON response.
#'
#' @param vc Value for the `cq` request parameter; URL-encoded (including
#'   reserved characters) before it is appended. Defaults to the empty string.
#' @return Whatever `fromJSON()` parses from the response.
derekoStats <- function(vc = "") {
  # NOTE(review): `apiurl` is a free variable that is not defined in this
  # part of the file -- confirm it is provided elsewhere.
  endpoint <- paste0(apiurl, "statistics?cq=")
  statsUrl <- paste0(endpoint, URLencode(vc, reserved = TRUE))
  fromJSON(statsUrl)
}
| 13 | |
#' Send a query and return the annotated first response
#'
#' Builds the request from `query`, `vc` and `ql`, fetches it from the search
#' endpoint with `count=1`, and attaches the bookkeeping elements that
#' `KorAPFetchAll()` and `KorAPFetchNext()` read.
#'
#' @param con Connection object; `con$KorAPUrl` and `con$apiUrl` are used as
#'   base URLs for the web UI link and the API request respectively.
#' @param query Query string; URL-encoded (including reserved characters).
#' @param vc Value for the `cq` request parameter; the parameter is omitted
#'   when `vc` is the empty string.
#' @param ql Query language identifier, appended as `ql` without encoding.
#' @param fields Character vector of metadata fields to request and to keep
#'   in fetched results.
#' @return The parsed response with these elements added: `fields`,
#'   `requestUrl` (without `count`/`offset`), `request`, `webUIRequestUrl`,
#'   `nextStartIndex` (0) and `hasMoreMatches`.
KorAPQuery <- function(con, query, vc = "", ql = "poliqarp",
                       fields = defaultFields) {
  # `vc` is a scalar, so a plain if/else is used instead of ifelse().
  corpusPart <- if (vc != "") {
    paste0("&cq=", URLencode(vc, reserved = TRUE))
  } else {
    ""
  }
  request <- paste0("?q=", URLencode(query, reserved = TRUE),
                    corpusPart, "&ql=", ql)
  webUIRequestUrl <- paste0(con$KorAPUrl, request)
  # Request the fields the caller asked for; previously `defaultFields` was
  # always sent, so custom fields never arrived and ended up as NA columns.
  requestUrl <- paste0(con$apiUrl, "search", request,
                       "&fields=", paste(fields, collapse = ","),
                       "&access-rewrite-disabled=true")
  # Only one match is requested here; paging happens in the fetch functions.
  result <- fromJSON(paste0(requestUrl, "&count=1"))

  result$fields <- fields
  result$requestUrl <- requestUrl
  result$request <- request
  result$webUIRequestUrl <- webUIRequestUrl
  result$nextStartIndex <- 0
  # isTRUE() keeps this a strict TRUE/FALSE even if totalResults is absent.
  result$hasMoreMatches <- isTRUE(result$meta$totalResults > 0)
  result
}
| 32 | |
#' Fetch every match of a query, page by page
#'
#' Repeatedly requests `query$requestUrl` with `count=50` and an increasing
#' `offset`, keeps the columns named in `query$fields` from each page, and
#' returns all pages bound into one data frame.
#'
#' @param query Query object as returned by `KorAPQuery()`; the elements
#'   `meta$totalResults`, `requestUrl` and `fields` are read.
#' @param verbose If `TRUE`, print one progress line per retrieved page.
#' @return A data frame with one column per entry of `query$fields`. A
#'   `pubDate` column, if requested, is converted to `Date`; all other
#'   columns become factors. An empty data frame is returned when the query
#'   (or any fetched page) reports zero total results.
KorAPFetchAll <- function(query, verbose = FALSE) {
  if (query$meta$totalResults == 0) {
    return(data.frame())
  }

  page <- 1
  results <- 0
  # Pages are collected in a list and bound once at the end, which avoids
  # re-copying the accumulated data frame on every iteration.
  pages <- list()

  repeat {
    res <- fromJSON(paste0(query$requestUrl, "&count=50&offset=", results))
    if (res$meta$totalResults == 0) {
      return(data.frame())
    }
    # Requested fields missing from this page are added as NA columns so
    # that every page has the same set of columns.
    for (field in query$fields) {
      if (!field %in% colnames(res$matches)) {
        res$matches[, field] <- NA
      }
    }
    currentMatches <- res$matches[query$fields]
    # `pubDate` is treated specially only when it was actually requested;
    # the previous subset(select = -c(pubDate)) failed for field lists
    # that did not contain it.
    factorCols <- setdiff(colnames(currentMatches), "pubDate")
    if (length(factorCols) > 0) {
      currentMatches[factorCols] <- lapply(currentMatches[factorCols], factor)
    }
    if ("pubDate" %in% colnames(currentMatches)) {
      currentMatches$pubDate <- as.Date(currentMatches$pubDate,
                                        format = "%Y-%m-%d")
    }
    if (results == 0) {
      # The total reported by the first page decides when paging stops.
      expectedResults <- res$meta$totalResults
    }
    pages[[page]] <- currentMatches
    if (verbose) {
      cat(paste0("Retrieved page: ", page, "/", ceiling(expectedResults / res$meta$itemsPerPage), ': ', res$meta$benchmark, '\n'))
    }
    page <- page + 1
    results <- results + res$meta$itemsPerPage
    if (results >= expectedResults) {
      break
    }
  }
  do.call(rbind, pages)
}
Marc Kupietz | cb725f8 | 2019-08-30 18:04:57 +0200 | [diff] [blame^] | 68 | |
#' Fetch the next page of matches for a query
#'
#' Requests `query$requestUrl` with `count=50` at `offset`, keeps the columns
#' named in `query$fields`, and returns the new response carrying the matches
#' collected so far.
#'
#' @param query Query object as returned by `KorAPQuery()` or by a previous
#'   call to this function; the elements `nextStartIndex`,
#'   `meta$totalResults`, `requestUrl`, `request`, `webUIRequestUrl`,
#'   `fields` and `collectedMatches` are read.
#' @param offset Start index of the page to fetch; defaults to
#'   `query$nextStartIndex`. An offset of 0 starts a fresh collection.
#' @param verbose If `TRUE`, print the benchmark reported for the page.
#' @return If `query$nextStartIndex` has already reached
#'   `query$meta$totalResults`, `query` itself with `hasMoreMatches` set to
#'   `FALSE`. Otherwise the parsed response with `collectedMatches`,
#'   `nextStartIndex`, `fields`, `requestUrl`, `request`, `webUIRequestUrl`
#'   and `hasMoreMatches` added.
KorAPFetchNext <- function(query, offset = query$nextStartIndex,
                           verbose = FALSE) {
  # NOTE(review): this guard looks at query$nextStartIndex, not at `offset`;
  # kept as is because the intended behavior for an explicit offset is
  # unclear -- confirm.
  if (query$nextStartIndex >= query$meta$totalResults) {
    query$hasMoreMatches <- FALSE
    return(query)
  }

  res <- fromJSON(paste0(query$requestUrl, "&count=50&offset=", offset))
  # Requested fields missing from this page are added as NA columns.
  for (field in query$fields) {
    if (!field %in% colnames(res$matches)) {
      res$matches[, field] <- NA
    }
  }
  currentMatches <- res$matches[query$fields]
  # `pubDate` is treated specially only when it was actually requested;
  # the previous subset(select = -c(pubDate)) failed for field lists
  # that did not contain it.
  factorCols <- setdiff(colnames(currentMatches), "pubDate")
  if (length(factorCols) > 0) {
    currentMatches[factorCols] <- lapply(currentMatches[factorCols], factor)
  }
  if ("pubDate" %in% colnames(currentMatches)) {
    currentMatches$pubDate <- as.Date(currentMatches$pubDate,
                                      format = "%Y-%m-%d")
  }
  if (offset == 0) {
    res$collectedMatches <- currentMatches
  } else {
    res$collectedMatches <- rbind(query$collectedMatches, currentMatches)
  }
  if (verbose) {
    cat(paste0("Retrieved page in ", res$meta$benchmark, '\n'))
  }
  res$nextStartIndex <- res$meta$startIndex + res$meta$itemsPerPage
  # Carry the request bookkeeping over so the result can be passed back in.
  res$fields <- query$fields
  res$requestUrl <- query$requestUrl
  res$request <- query$request
  res$webUIRequestUrl <- query$webUIRequestUrl
  res$hasMoreMatches <- (res$meta$totalResults > res$nextStartIndex)

  res
}