Add matchStart and matchEnd columns to collectedMatches in corpusQuery result Resolves #22 Change-Id: I6af9de503e5911cbe5c566b0fae529cfba7b764c

commit: f48811229c96fa0dc4288d494dba917e9616437b [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Tue Dec 17 14:55:39 2024 +0100
committer: Marc Kupietz <kupietz@ids-mannheim.de> Wed Dec 18 15:55:37 2024 +0100
tree: 7b00feec712cda9f3c1ff8887fd80f1d3331615e
parent: e5374f2d084e63309105093994e9b8c83f9d3521 [diff]
diff --git a/NAMESPACE b/NAMESPACE
index 42fe68e..534fbd3 100644
--- a/NAMESPACE
+++ b/NAMESPACE

@@ -109,6 +109,7 @@
 importFrom(stringr,str_detect)
 importFrom(stringr,str_match)
 importFrom(stringr,str_split)
+importFrom(stringr,word)
 importFrom(tibble,add_column)
 importFrom(tibble,as_tibble)
 importFrom(tibble,enframe)

diff --git a/R/KorAPQuery.R b/R/KorAPQuery.R
index 7abed6a..432dea4 100644
--- a/R/KorAPQuery.R
+++ b/R/KorAPQuery.R

@@ -276,7 +276,8 @@
 #' @aliases fetchNext
 #' @rdname KorAPQuery-class
 #' @importFrom dplyr rowwise mutate bind_rows select summarise n select
-#' @importFrom tibble enframe
+#' @importFrom tibble enframe add_column
+#' @importFrom stringr word
 #' @importFrom tidyr unnest unchop pivot_wider
 #' @importFrom purrr map
 #' @export
@@ -330,7 +331,15 @@
         currentMatches[, field] <- NA
       }
     }
-    currentMatches <- currentMatches %>% select(kqo@fields)
+    currentMatches <- currentMatches %>%
+      select(kqo@fields) %>%
+      mutate(
+        tmp_positions = gsub(".*-p(\\d+)-(\\d+)", "\\1 \\2", res$matches$matchID),
+        matchStart = as.integer(stringr::word(tmp_positions, 1)),
+        matchEnd = as.integer(stringr::word(tmp_positions, 2)) - 1
+      ) %>%
+      select(-tmp_positions)
+
     if (!is.list(collectedMatches)) {
       collectedMatches <- currentMatches
     } else {

diff --git a/demo/00Index b/demo/00Index
index 3e9fbf2..426b8b6 100644
--- a/demo/00Index
+++ b/demo/00Index

@@ -15,3 +15,4 @@
 collocation_score_by_country    Plot collocation scores depending on country of publication using ggplot2.
 pluralGenderVariants            Plot frequencies of the plural gender variants of a word in the core corpus of the Council for German Orthography (OKK) over time.
 adjectiveCollocates             Runtime OAuth2 browser flow demonstration using the httr2 package showing adjective collocates of 'Gendern'
+relativeTextpositionBoxplot     Plot the relative text positions of a some adverbs as highcharter boxplot.

diff --git a/demo/relativeTextpositionBoxplot.R b/demo/relativeTextpositionBoxplot.R
new file mode 100644
index 0000000..02823da
--- /dev/null
+++ b/demo/relativeTextpositionBoxplot.R

@@ -0,0 +1,57 @@
+library(RKorAPClient)
+library(highcharter)
+library(tidyverse)
+
+kco <- new("KorAPConnection", verbose = TRUE)
+
+set.seed(7)
+
+get_relative_positions_sample <- function(kco, query, sampleSize = 400) {
+  res <- corpusQuery(kco, query)
+  res <- fetchNext(res, maxFetch = sampleSize, randomizePageOrder = TRUE)
+  matches <- res@collectedMatches
+  matches <- matches %>%
+    mutate(
+      query = query,
+      vc = paste0('textSigle="', textSigle, '"'),
+      textSize = corpusStats(kco, vc, as.df = TRUE)$tokens,
+      relativeTextPosition = matchStart / textSize
+    )
+  cat("\n\n", query, ":\n")
+  print(summary(matches$relativeTextPosition))
+  cat("\n\n")
+  return(matches)
+}
+
+df <- c(
+  "anfangs/i",
+  "zuguterletzt/i",
+  "zun\u00e4chst/i", # it is still necessary to encode non ascii characters in R package demos
+  "zuerst/i",
+  "zuletzt/i",
+  "schlie\u00dflich/i"
+) %>%
+  map(~ get_relative_positions_sample(kco, .)) %>%
+  bind_rows()
+
+hc_data <- df %>%
+  group_by(query) %>%
+  summarise(
+    min = min(relativeTextPosition),
+    q1 = quantile(relativeTextPosition, 0.25),
+    median = median(relativeTextPosition),
+    q3 = quantile(relativeTextPosition, 0.75),
+    max = max(relativeTextPosition)
+  ) %>%
+  mutate(data = pmap(list(min, q1, median, q3, max), c)) %>%
+  select(query, data)
+
+hc <- highchart() %>%
+  hc_chart(type = "boxplot", inverted = TRUE) %>%
+  hc_xAxis(categories = hc_data$query) %>%
+  hc_yAxis(ceiling = 1, title = list(text = "Relative position in text")) %>%
+  hc_add_series(data = hc_data$data) %>%
+  hc_title(text = "Relative positions of some adverbs in DeReKo texts") %>%
+  hc_legend(enabled = FALSE)
+
+print(hc)

diff --git a/tests/testthat/test-corpusQuery.R b/tests/testthat/test-corpusQuery.R
index f7ed2e0..b9d4b76 100644
--- a/tests/testthat/test-corpusQuery.R
+++ b/tests/testthat/test-corpusQuery.R

@@ -58,13 +58,16 @@
   expect_gt(q@totalResults, 0)
 })
 
-test_that("fetchAll fetches all results", {
+test_that("fetchAll fetches all results with match positions", {
   skip_if_offline()
   q <- new("KorAPConnection", verbose = TRUE) %>%
     corpusQuery("Ameisenplage", vc = "pubDate since 2014")
   expectedResults <- q@totalResults
   matches <- fetchAll(q)@collectedMatches
   expect_equal(nrow(matches), expectedResults)
+  expect_true(is.numeric(matches$matchEnd[1]))
+  expect_true(is.numeric(matches$matchStart[1]))
+  expect_equal(matches$matchStart[1], matches$matchEnd[1])
 })
 
 test_that("fetchAll fetches textClass metadatum", {
commit	f48811229c96fa0dc4288d494dba917e9616437b	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Tue Dec 17 14:55:39 2024 +0100
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Wed Dec 18 15:55:37 2024 +0100
tree	7b00feec712cda9f3c1ff8887fd80f1d3331615e
parent	e5374f2d084e63309105093994e9b8c83f9d3521 [diff]