Add matchStart and matchEnd columns to collectedMatches in corpusQuery result
Resolves #22
Change-Id: I6af9de503e5911cbe5c566b0fae529cfba7b764c
diff --git a/NAMESPACE b/NAMESPACE
index 42fe68e..534fbd3 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -109,6 +109,7 @@
importFrom(stringr,str_detect)
importFrom(stringr,str_match)
importFrom(stringr,str_split)
+importFrom(stringr,word)
importFrom(tibble,add_column)
importFrom(tibble,as_tibble)
importFrom(tibble,enframe)
diff --git a/R/KorAPQuery.R b/R/KorAPQuery.R
index 7abed6a..432dea4 100644
--- a/R/KorAPQuery.R
+++ b/R/KorAPQuery.R
@@ -276,7 +276,8 @@
#' @aliases fetchNext
#' @rdname KorAPQuery-class
#' @importFrom dplyr rowwise mutate bind_rows select summarise n select
-#' @importFrom tibble enframe
+#' @importFrom tibble enframe add_column
+#' @importFrom stringr word
#' @importFrom tidyr unnest unchop pivot_wider
#' @importFrom purrr map
#' @export
@@ -330,7 +331,15 @@
currentMatches[, field] <- NA
}
}
- currentMatches <- currentMatches %>% select(kqo@fields)
+ currentMatches <- currentMatches %>%
+ select(kqo@fields) %>%
+ mutate(
+ tmp_positions = gsub(".*-p(\\d+)-(\\d+)", "\\1 \\2", res$matches$matchID),
+ matchStart = as.integer(stringr::word(tmp_positions, 1)),
+ matchEnd = as.integer(stringr::word(tmp_positions, 2)) - 1
+ ) %>%
+ select(-tmp_positions)
+
if (!is.list(collectedMatches)) {
collectedMatches <- currentMatches
} else {
diff --git a/demo/00Index b/demo/00Index
index 3e9fbf2..426b8b6 100644
--- a/demo/00Index
+++ b/demo/00Index
@@ -15,3 +15,4 @@
collocation_score_by_country Plot collocation scores depending on country of publication using ggplot2.
pluralGenderVariants Plot frequencies of the plural gender variants of a word in the core corpus of the Council for German Orthography (OKK) over time.
adjectiveCollocates Runtime OAuth2 browser flow demonstration using the httr2 package showing adjective collocates of 'Gendern'
+relativeTextpositionBoxplot Plot the relative text positions of a some adverbs as highcharter boxplot.
diff --git a/demo/relativeTextpositionBoxplot.R b/demo/relativeTextpositionBoxplot.R
new file mode 100644
index 0000000..02823da
--- /dev/null
+++ b/demo/relativeTextpositionBoxplot.R
@@ -0,0 +1,57 @@
+library(RKorAPClient)
+library(highcharter)
+library(tidyverse)
+
+kco <- new("KorAPConnection", verbose = TRUE)
+
+set.seed(7)
+
+get_relative_positions_sample <- function(kco, query, sampleSize = 400) {
+ res <- corpusQuery(kco, query)
+ res <- fetchNext(res, maxFetch = sampleSize, randomizePageOrder = TRUE)
+ matches <- res@collectedMatches
+ matches <- matches %>%
+ mutate(
+ query = query,
+ vc = paste0('textSigle="', textSigle, '"'),
+ textSize = corpusStats(kco, vc, as.df = TRUE)$tokens,
+ relativeTextPosition = matchStart / textSize
+ )
+ cat("\n\n", query, ":\n")
+ print(summary(matches$relativeTextPosition))
+ cat("\n\n")
+ return(matches)
+}
+
+df <- c(
+ "anfangs/i",
+ "zuguterletzt/i",
+ "zun\u00e4chst/i", # it is still necessary to encode non ascii characters in R package demos
+ "zuerst/i",
+ "zuletzt/i",
+ "schlie\u00dflich/i"
+) %>%
+ map(~ get_relative_positions_sample(kco, .)) %>%
+ bind_rows()
+
+hc_data <- df %>%
+ group_by(query) %>%
+ summarise(
+ min = min(relativeTextPosition),
+ q1 = quantile(relativeTextPosition, 0.25),
+ median = median(relativeTextPosition),
+ q3 = quantile(relativeTextPosition, 0.75),
+ max = max(relativeTextPosition)
+ ) %>%
+ mutate(data = pmap(list(min, q1, median, q3, max), c)) %>%
+ select(query, data)
+
+hc <- highchart() %>%
+ hc_chart(type = "boxplot", inverted = TRUE) %>%
+ hc_xAxis(categories = hc_data$query) %>%
+ hc_yAxis(ceiling = 1, title = list(text = "Relative position in text")) %>%
+ hc_add_series(data = hc_data$data) %>%
+ hc_title(text = "Relative positions of some adverbs in DeReKo texts") %>%
+ hc_legend(enabled = FALSE)
+
+print(hc)
diff --git a/tests/testthat/test-corpusQuery.R b/tests/testthat/test-corpusQuery.R
index f7ed2e0..b9d4b76 100644
--- a/tests/testthat/test-corpusQuery.R
+++ b/tests/testthat/test-corpusQuery.R
@@ -58,13 +58,16 @@
expect_gt(q@totalResults, 0)
})
-test_that("fetchAll fetches all results", {
+test_that("fetchAll fetches all results with match positions", {
skip_if_offline()
q <- new("KorAPConnection", verbose = TRUE) %>%
corpusQuery("Ameisenplage", vc = "pubDate since 2014")
expectedResults <- q@totalResults
matches <- fetchAll(q)@collectedMatches
expect_equal(nrow(matches), expectedResults)
+ expect_true(is.numeric(matches$matchEnd[1]))
+ expect_true(is.numeric(matches$matchStart[1]))
+ expect_equal(matches$matchStart[1], matches$matchEnd[1])
})
test_that("fetchAll fetches textClass metadatum", {