Merge "Do not allow repeated nodes in collocation analysis"
diff --git a/.github/workflows/check-linux.yaml b/.github/workflows/check-linux.yaml
deleted file mode 100644
index e10b3c4..0000000
--- a/.github/workflows/check-linux.yaml
+++ /dev/null
@@ -1,61 +0,0 @@
-on: [push, pull_request]
-
-name: check-linux
-
-jobs:
- R-CMD-check:
- runs-on: ${{ matrix.config.os }}
-
- name: ${{ matrix.config.os }} (${{ matrix.config.r }})
-
- strategy:
- fail-fast: false
- matrix:
- config:
- - { os: ubuntu-18.04, r: 'release', cran: "https://demo.rstudiopm.com/all/__linux__/bionic/latest", args: "--no-manual" }
-
- env:
- R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
- CRAN: ${{ matrix.config.cran }}
- GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
-
- steps:
- - uses: actions/checkout@v1
-
- - uses: r-lib/actions/setup-r@master
- with:
- r-version: ${{ matrix.config.r }}
-
- - uses: r-lib/actions/setup-pandoc@master
-
- - uses: r-lib/actions/setup-tinytex@master
- if: contains(matrix.config.args, 'no-manual') == false
-
- - name: Install system dependencies
- if: runner.os == 'Linux'
- run: |
- sudo apt-get update -y
- sudo apt-get install -y libglpk-dev libjq-dev libv8-dev libprotobuf-dev protobuf-compiler libudunits2-dev libgdal-dev
-
- - name: Cache R packages
- uses: actions/cache@v1
- with:
- path: ${{ env.R_LIBS_USER }}
- key: ${{ runner.os }}-r-${{ matrix.config.r }}-${{ hashFiles('DESCRIPTION') }}
-
- - name: Install dependencies
- run: Rscript -e "install.packages('remotes')" -e "remotes::install_deps(dependencies = TRUE)" -e "remotes::install_cran('rcmdcheck')" -e "install.packages('igraph')" -e "install.packages('covr')"
-
- - name: Check
- run: Rscript -e "rcmdcheck::rcmdcheck(args = c('${{ matrix.config.args }}', '--run-donttest'), error_on = 'warning', check_dir = 'check')"
-
- - name: Upload check results
- if: failure()
- uses: actions/upload-artifact@master
- with:
- name: ${{ runner.os }}-r${{ matrix.config.r }}-results
- path: check
-
- - name: Test coverage
- run: covr::codecov()
- shell: Rscript {0}
diff --git a/.github/workflows/check-mac.yaml b/.github/workflows/check-mac.yaml
deleted file mode 100644
index 45c505e..0000000
--- a/.github/workflows/check-mac.yaml
+++ /dev/null
@@ -1,55 +0,0 @@
-on: [push, pull_request]
-
-name: check-mac
-
-jobs:
- R-CMD-check:
- runs-on: ${{ matrix.config.os }}
-
- name: ${{ matrix.config.os }} (${{ matrix.config.r }})
-
- strategy:
- fail-fast: false
- matrix:
- config:
- - { os: macOS-latest, r: '4.0', args: "--no-manual" }
-
- env:
- R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
- CRAN: ${{ matrix.config.cran }}
-
- steps:
- - uses: actions/checkout@v1
-
- - uses: r-lib/actions/setup-r@master
- with:
- r-version: ${{ matrix.config.r }}
-
- - uses: r-lib/actions/setup-pandoc@master
-
- - uses: r-lib/actions/setup-tinytex@master
- if: contains(matrix.config.args, 'no-manual') == false
-
- - name: Install system dependencies
- if: runner.os == 'macOS' && contains(matrix.config.args, 'no-manual') == false
- run: |
- tlmgr install pdftexcmds
-
- - name: Cache R packages
- uses: actions/cache@v1
- with:
- path: ${{ env.R_LIBS_USER }}
- key: ${{ runner.os }}-r-${{ matrix.config.r }}-${{ hashFiles('DESCRIPTION') }}
-
- - name: Install dependencies
- run: Rscript -e "install.packages('remotes')" -e "remotes::install_deps(dependencies = TRUE)" -e "remotes::install_cran('rcmdcheck')" -e "install.packages('igraph')"
-
- - name: Check
- run: Rscript -e "rcmdcheck::rcmdcheck(args = '${{ matrix.config.args }}', error_on = 'warning', check_dir = 'check')"
-
- - name: Upload check results
- if: failure()
- uses: actions/upload-artifact@master
- with:
- name: ${{ runner.os }}-r${{ matrix.config.r }}-results
- path: check
diff --git a/.github/workflows/check-standard.yaml b/.github/workflows/check-standard.yaml
new file mode 100644
index 0000000..0528262
--- /dev/null
+++ b/.github/workflows/check-standard.yaml
@@ -0,0 +1,58 @@
+# Workflow derived from https://github.com/r-lib/actions/tree/master/examples
+# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
+on:
+  push:
+    branches: [main, master]
+  pull_request:
+    branches: [main, master]
+
+name: R-CMD-check
+
+jobs:
+  R-CMD-check:
+    runs-on: ${{ matrix.config.os }}
+
+    name: ${{ matrix.config.os }} (${{ matrix.config.r }})
+
+    strategy:
+      fail-fast: false  # let the other matrix entries finish when one of them fails
+      matrix:
+        config:
+          - {os: macOS-latest, r: 'release'}
+          - {os: windows-latest, r: 'release'}
+          - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'}
+          - {os: ubuntu-latest, r: 'release'}
+          - {os: ubuntu-latest, r: 'oldrel-1'}
+
+    env:
+      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
+      R_KEEP_PKG_SOURCE: 'yes'  # quoted: a bare yes is a boolean for YAML 1.1 tooling
+
+    steps:
+      - uses: actions/checkout@v2
+
+      - uses: r-lib/actions/setup-pandoc@v1
+
+      - uses: r-lib/actions/setup-r@v1
+        with:
+          r-version: ${{ matrix.config.r }}
+          http-user-agent: ${{ matrix.config.http-user-agent }}
+          use-public-rspm: true
+
+      - uses: r-lib/actions/setup-r-dependencies@v1
+        with:
+          extra-packages: rcmdcheck
+
+      - uses: r-lib/actions/check-r-package@v1
+
+      - name: Show testthat output
+        if: always()  # print the test log for passing and failing runs alike
+        run: find check -name 'testthat.Rout*' -exec cat '{}' \; || true
+        shell: bash
+
+      - name: Upload check results
+        if: failure()
+        uses: actions/upload-artifact@main  # NOTE(review): moving ref; consider pinning a release tag
+        with:
+          name: ${{ runner.os }}-r${{ matrix.config.r }}-results
+          path: check
diff --git a/.github/workflows/check-windows.yaml b/.github/workflows/check-windows.yaml
deleted file mode 100644
index 21bc5dc..0000000
--- a/.github/workflows/check-windows.yaml
+++ /dev/null
@@ -1,51 +0,0 @@
-on: [push, pull_request]
-
-name: check-windows
-
-jobs:
- R-CMD-check:
- runs-on: ${{ matrix.config.os }}
-
- name: ${{ matrix.config.os }} (${{ matrix.config.r }})
-
- strategy:
- fail-fast: false
- matrix:
- config:
- - { os: windows-latest, r: 'release', args: "--no-manual"}
- - { os: windows-latest, r: 'devel', args: "--no-manual"}
-
- env:
- R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
- CRAN: ${{ matrix.config.cran }}
-
- steps:
- - uses: actions/checkout@v1
-
- - uses: r-lib/actions/setup-r@master
- with:
- r-version: ${{ matrix.config.r }}
-
- - uses: r-lib/actions/setup-pandoc@master
-
- - uses: r-lib/actions/setup-tinytex@master
- if: contains(matrix.config.args, 'no-manual') == false
-
- - name: Cache R packages
- uses: actions/cache@v1
- with:
- path: ${{ env.R_LIBS_USER }}
- key: ${{ runner.os }}-r-${{ matrix.config.r }}-${{ hashFiles('DESCRIPTION') }}
-
- - name: Install dependencies
- run: Rscript -e "install.packages('remotes')" -e "remotes::install_deps(dependencies = TRUE)" -e "remotes::install_cran('rcmdcheck')" -e "install.packages('igraph')"
-
- - name: Check
- run: Rscript -e "rcmdcheck::rcmdcheck(args = '${{ matrix.config.args }}', error_on = 'note', check_dir = 'check')"
-
- - name: Upload check results
- if: failure()
- uses: actions/upload-artifact@master
- with:
- name: ${{ runner.os }}-r${{ matrix.config.r }}-results
- path: check
diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml
new file mode 100644
index 0000000..3c0da1c
--- /dev/null
+++ b/.github/workflows/test-coverage.yaml
@@ -0,0 +1,30 @@
+# Workflow derived from https://github.com/r-lib/actions/tree/master/examples
+# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
+name: "test-coverage"
+
+on:
+  pull_request:
+    branches: ["main", "master"]
+  push:
+    branches: ["main", "master"]
+
+jobs:
+  test-coverage:
+    env:
+      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
+    runs-on: "ubuntu-latest"
+
+    steps:
+      - uses: "actions/checkout@v2"
+
+      - uses: "r-lib/actions/setup-r@v1"
+        with:
+          use-public-rspm: true
+
+      - uses: "r-lib/actions/setup-r-dependencies@v1"
+        with:
+          extra-packages: "covr"  # covr is needed by the coverage step below
+
+      - name: "Test coverage"
+        shell: "Rscript {0}"  # the run text is executed as an R script
+        run: "covr::codecov()"
diff --git a/.gitignore b/.gitignore
index 04ac529..305f4b5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,5 @@
cache/
docs
*.log
+*_files/
+*.bak
diff --git a/DESCRIPTION b/DESCRIPTION
index cf7f8ce..257b420 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
Package: RKorAPClient
Type: Package
Title: 'KorAP' Web Service Client Package
-Version: 0.7.1
+Version: 0.7.1.9000
Authors@R:
c(person(given = "Marc",
family = "Kupietz",
diff --git a/NEWS.md b/NEWS.md
index 9e71a1d..bae0810 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,11 @@
+# RKorAPClient 0.7.1.9000 (unreleased development version)
+
+- new parameters added to `collocationAnalysis` function:
+ - `maxRecurse` - apply collocation analysis recursively `maxRecurse` times
+ - `addExamples` - If `TRUE`, examples for instances of collocations will be added in a column `example`. This makes a difference in particular if `node` is given as a lemma query.
+ - `thresholdScore` - association score function to use for computing the threshold that is applied for recursive collocation analysis calls
+ - `threshold` - minimum value of `thresholdScore` function call to apply collocation analysis recursively
+
# RKorAPClient 0.7.1 (CRAN release)
- documentation migrated to roxygen2md
- Some examples in the documentation are now wrapped with:
diff --git a/R/KorAPConnection.R b/R/KorAPConnection.R
index 243e179..e4a1ed2 100644
--- a/R/KorAPConnection.R
+++ b/R/KorAPConnection.R
@@ -62,7 +62,7 @@
#' @rdname KorAPConnection-class
#' @export
setMethod("initialize", "KorAPConnection",
- function(.Object, KorAPUrl = "https://korap.ids-mannheim.de/", apiVersion = 'v1.0', apiUrl, accessToken = getAccessToken(KorAPUrl), userAgent = "R-KorAP-Client", timeout=110, verbose = FALSE, cache = TRUE) {
+ function(.Object, KorAPUrl = "https://korap.ids-mannheim.de/", apiVersion = 'v1.0', apiUrl, accessToken = getAccessToken(KorAPUrl), userAgent = "R-KorAP-Client", timeout=240, verbose = FALSE, cache = TRUE) {
.Object <- callNextMethod()
m <- regexpr("https?://[^?]+", KorAPUrl, perl = TRUE)
.Object@KorAPUrl <- regmatches(KorAPUrl, m)
diff --git a/R/association-scores.R b/R/association-scores.R
index 526ebab..8c79295 100644
--- a/R/association-scores.R
+++ b/R/association-scores.R
@@ -107,7 +107,7 @@
#' Dunning, T. (1993): Accurate methods for the statistics of surprise and coincidence. Comput. Linguist. 19, 1 (March 1993), 61-74.
#'
#' Evert, Stefan (2004): The Statistics of Word Cooccurrences: Word Pairs and Collocations. PhD dissertation, IMS, University of Stuttgart. Published in 2005, URN urn:nbn:de:bsz:93-opus-23714.
-#' Free PDF available from <http://purl.org/stefan.evert/PUB/Evert2004phd.pdf>
+#' Free PDF available from <https://purl.org/stefan.evert/PUB/Evert2004phd.pdf>
#'
ll <- function(O1, O2, O, N, E, window_size) {
r1 = as.double(O1) * window_size
diff --git a/R/collocationAnalysis.R b/R/collocationAnalysis.R
index eb691f8..a2d6c5d 100644
--- a/R/collocationAnalysis.R
+++ b/R/collocationAnalysis.R
@@ -34,6 +34,12 @@
#' @param exactFrequencies if FALSE, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies
#' @param seed seed for random page collecting order
#' @param expand if TRUE, `node` and `vc` parameters are expanded to all of their combinations
+#' @param maxRecurse apply collocation analysis recursively `maxRecurse` times
+#' @param addExamples If TRUE, examples for instances of collocations will be added in a column `example`. This makes a difference in particular if `node` is given as a lemma query.
+#' @param thresholdScore association score function (see \code{\link{association-score-functions}}) to use for computing the threshold that is applied for recursive collocation analysis calls
+#' @param threshold minimum value of `thresholdScore` function call to apply collocation analysis recursively
+#' @param localStopwords vector of stopwords that will not be considered as collocates in the current function call, but that will not be passed to recursive calls
+#' @param collocateFilterRegex allow only collocates matching the regular expression
#' @param ... more arguments will be passed to [collocationScoreQuery()]
#' @inheritParams collocationScoreQuery,KorAPConnection-method
#' @return Tibble with top collocates, association scores, corresponding URLs for web user interface queries, etc.
@@ -79,6 +85,12 @@
stopwords = append(RKorAPClient::synsemanticStopwords(), node),
seed = 7,
expand = length(vc) != length(node),
+ maxRecurse = 0,
+ addExamples = FALSE,
+ thresholdScore = "logDice",
+ threshold = 2.0,
+ localStopwords = c(),
+ collocateFilterRegex = '^[:alnum:]+-?[:alnum:]*$',
...) {
# https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
word <- frequency <- NULL
@@ -93,7 +105,7 @@
node <- lemmatizeWordQuery(node)
}
- if (length(node) > 1 || length(vc) > 1) {
+ result <- if (length(node) > 1 || length(vc) > 1) {
grid <- if (expand) expand_grid(node=node, vc=vc) else tibble(node=node, vc=vc)
purrr::pmap(grid, function(node, vc, ...)
collocationAnalysis(kco,
@@ -108,6 +120,8 @@
withinSpan = withinSpan,
exactFrequencies = exactFrequencies,
stopwords = stopwords,
+ addExamples = TRUE,
+ localStopwords = localStopwords,
seed = seed,
expand = expand,
...) ) %>%
@@ -123,7 +137,7 @@
rightContextSize = rightContextSize,
searchHitsSampleLimit = searchHitsSampleLimit,
ignoreCollocateCase = ignoreCollocateCase,
- stopwords = stopwords,
+ stopwords = append(stopwords, localStopwords),
...
)
@@ -149,9 +163,66 @@
tibble()
}
}
+    # Recurse only when a collocate reaches the threshold; the score column is read by name from `result`.
+    if (maxRecurse > 0 && length(result) > 0 && any(result[[thresholdScore]] >= threshold, na.rm = TRUE)) {
+      recurseWith <- result %>% filter(!!as.name(thresholdScore) >= threshold)
+      result <- collocationAnalysis(
+        kco,
+        node = paste0("(", buildCollocationQuery(
+          removeWithinSpan(recurseWith$node, withinSpan),
+          recurseWith$collocate,
+          leftContextSize = leftContextSize,
+          rightContextSize = rightContextSize,
+          withinSpan = ""
+        ), ")"),
+        vc = vc, minOccur = minOccur,
+        leftContextSize = leftContextSize,
+        rightContextSize = rightContextSize,
+        withinSpan = withinSpan,
+        maxRecurse = maxRecurse - 1,
+        thresholdScore = thresholdScore, threshold = threshold,
+        stopwords = stopwords,
+        localStopwords = recurseWith$collocate,
+        exactFrequencies = exactFrequencies,
+        searchHitsSampleLimit = searchHitsSampleLimit,
+        topCollocatesLimit = topCollocatesLimit,
+        addExamples = FALSE
+      ) %>%
+        bind_rows(result) %>%
+        filter(logDice >= 2) %>%
+        filter(.$O >= minOccur) %>%
+        dplyr::arrange(dplyr::desc(logDice))
+    }
+ if (addExamples && length(result) > 0) {
+ result$query <-buildCollocationQuery(
+ result$node,
+ result$collocate,
+ leftContextSize = leftContextSize,
+ rightContextSize = rightContextSize,
+ withinSpan = ""
+ )
+ result$example <- findExample(
+ kco,
+ query = result$query,
+ vc = result$vc
+ )
+ }
+ result
}
)
+# Strip a `contains(<withinSpan>, ...)` wrapper, optionally enclosed in one more pair
+# of parentheses, from each element of `query`; other queries are returned unchanged.
+# `query`: character vector of queries; `withinSpan`: span name, e.g. "base/s=s".
+# #' @export
+removeWithinSpan <- function(query, withinSpan) {
+  if (length(withinSpan) == 0 || is.na(withinSpan) || !nzchar(withinSpan)) return(query)
+  # withinSpan is literal text, so escape regex metacharacters before interpolating it
+  span <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", withinSpan)
+  res <- sub(sprintf("^\\(contains\\(<%s>, ?(.*)\\){2}$", span), "\\1", query)
+  sub(sprintf("^contains\\(<%s>, ?(.*)\\)$", span), "\\1", res)
+}
+
#' @importFrom magrittr debug_pipe
#' @importFrom stringr str_match str_split str_detect
#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
@@ -163,6 +234,7 @@
ignoreCollocateCase = FALSE,
stopwords = c(),
tokenizeRegex = "([! )(\uc2\uab,.:?\u201e\u201c\'\"]+|")",
+ collocateFilterRegex = '^[:alnum:]+-?[:alnum:]*$',
oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
verbose = TRUE) {
word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
@@ -171,12 +243,13 @@
if (length(snippet) < 1) {
dplyr::tibble(word=c(), frequency=c())
} else if (length(snippet) > 1) {
- log.info(verbose, paste("Joinging", length(snippet), "kwics\n"))
+ log.info(verbose, paste("Joining", length(snippet), "kwics\n"))
for (s in snippet) {
oldTable <- snippet2FreqTable(
s,
leftContextSize = leftContextSize,
rightContextSize = rightContextSize,
+ collocateFilterRegex = collocateFilterRegex,
oldTable = oldTable,
stopwords = stopwords
)
@@ -213,7 +286,7 @@
table(c(left, right)) %>%
dplyr::as_tibble(.name_repair = "minimal") %>%
dplyr::rename(word = 1, frequency = 2) %>%
- dplyr::filter(str_detect(word, '^[:alnum:]+-?[:alnum:]*$')) %>%
+ dplyr::filter(str_detect(word, collocateFilterRegex)) %>%
dplyr::anti_join(stopwordsTable, by="word") %>%
dplyr::bind_rows(oldTable)
}
@@ -255,7 +328,9 @@
"dem",
"nicht",
"ein",
+ "Ein",
"eine",
+ "Eine",
"es",
"auch",
"an",
@@ -290,6 +365,35 @@
return(res)
}
+
+# Fetch one example snippet for each collocation query (helper, not exported).
+# `kco` is handed to corpusQuery(); `query` and `vc` are character vectors, a
+# shorter `vc` is repeated. With `matchOnly = TRUE` only the text between
+# <mark> and </mark> is kept, otherwise the snippet with all tags removed.
+# Returns a character vector parallel to `query`; "" where no example was found.
+# #' @export
+findExample <- function(kco, query, vc = "", matchOnly = TRUE) {
+  out <- character(length = length(query))  # initialised to ""
+  if (length(vc) < length(query)) vc <- rep(vc, length(query))
+
+  for (i in seq_along(query)) {
+    q <- corpusQuery(kco, paste0("(", query[i], ")"), vc = vc[i], metadataOnly = FALSE)
+    if (q@totalResults > 0) {
+      q <- fetchNext(q, maxFetch = 50, randomizePageOrder = FALSE)
+      example <- as.character((q@collectedMatches)$snippet[1])
+      # keep the "" default when no usable snippet came back
+      if (length(example) == 1 && !is.na(example)) {
+        out[i] <- if (matchOnly) {
+          gsub('.*<mark>(.+)</mark>.*', '\\1', example)
+        } else {
+          stringr::str_replace_all(example, '<[^>]*>', '')  # every tag, not just the first
+        }
+      }
+    }
+  }
+  out
+}
+
collocatesQuery <-
function(kco,
query,
@@ -313,6 +417,7 @@
rightContextSize = rightContextSize,
ignoreCollocateCase = ignoreCollocateCase,
stopwords = stopwords,
+ ...,
verbose = kco@verbose) %>%
mutate(frequency = frequency * q@totalResults / min(q@totalResults, searchHitsSampleLimit)) %>%
filter(frequency >= minOccur)
diff --git a/R/collocationScoreQuery.R b/R/collocationScoreQuery.R
index 53d214a..478f03c 100644
--- a/R/collocationScoreQuery.R
+++ b/R/collocationScoreQuery.R
@@ -96,7 +96,7 @@
buildWebUIRequestUrl(
kco,
buildCollocationQuery(
- node,
+ removeWithinSpan(node, withinSpan),
collocate,
lemmatizeNodeQuery,
lemmatizeCollocateQuery,
@@ -120,6 +120,7 @@
})
+# #' @export
buildCollocationQuery <- function( node,
collocate,
lemmatizeNodeQuery = FALSE,
diff --git a/R/hc_freq_by_year_ci.R b/R/hc_freq_by_year_ci.R
index 08ae695..2b54cb3 100644
--- a/R/hc_freq_by_year_ci.R
+++ b/R/hc_freq_by_year_ci.R
@@ -21,15 +21,14 @@
#' @examples
#' \dontrun{
#'
-#' year <- c(1990:2018)}
-#' alternatives <- c("macht []{0,3} Sinn", "ergibt []{0,3} Sinn")}\dontshow{alternatives <- c("macht []{0,3} Sinn")}
+#' year <- c(1990:2018)
+#' alternatives <- c("macht []{0,3} Sinn", "ergibt []{0,3} Sinn")
#' new("KorAPConnection", verbose = TRUE) %>%
#' frequencyQuery(query = alternatives,
#' vc = paste("textType = /Zeit.*/ & pubDate in", year),
#' as.alternatives = TRUE) %>%
#' hc_freq_by_year_ci(as.alternatives = TRUE)
#'
-#' \dontrun{
#'
#' kco <- new("KorAPConnection", verbose = TRUE)
#' expand_grid(
diff --git a/Readme.md b/Readme.md
index f7b0832..0fb1fe8 100644
--- a/Readme.md
+++ b/Readme.md
@@ -4,13 +4,11 @@
[![CRAN downloads](http://cranlogs.r-pkg.org/badges/RKorAPClient?color=brightgreen)](http://www.r-pkg.org/pkg/RKorAPClient)
[![Project Status: Active – The project has reached a stable, usable state and is being actively developed.](https://www.repostatus.org/badges/latest/active.svg)](https://www.repostatus.org/#active)
[![Lifecycle:stable](https://img.shields.io/badge/lifecycle-stable-brightgreen.svg)](https://www.tidyverse.org/lifecycle/#stable)
+[![R build status](https://github.com/KorAP/RKorAPClient/workflows/R-CMD-check/badge.svg)](https://github.com/KorAP/RKorAPClient/actions)
[![Codecov test coverage](https://codecov.io/gh/KorAP/RKorAPClient/branch/master/graph/badge.svg)](https://codecov.io/gh/KorAP/RKorAPClient?branch=master)
[![Last commit](https://img.shields.io/github/last-commit/KorAP/RKorAPClient.svg)](https://github.com/KorAP/RKorAPClient/issues)
[![GitHub closed issues](https://img.shields.io/github/issues-raw/KorAP/RKorAPClient.svg)](https://github.com/KorAP/RKorAPClient/issues)
[![GitHub issues](https://img.shields.io/github/issues-closed-raw/KorAP/RKorAPClient.svg)](https://github.com/KorAP/RKorAPClient/issues)
-[![check-windows](https://github.com/KorAP/RKorAPClient/workflows/check-windows/badge.svg)](https://github.com/KorAP/RKorAPClient/actions?workflow=check-windows)
-[![check-mac](https://github.com/KorAP/RKorAPClient/workflows/check-mac/badge.svg)](https://github.com/KorAP/RKorAPClient/actions?workflow=check-mac)
-[![check-linux](https://github.com/KorAP/RKorAPClient/workflows/check-linux/badge.svg)](https://github.com/KorAP/RKorAPClient/actions?workflow=check-linux)
[![Github Stars](https://img.shields.io/github/stars/KorAP/RKorAPClient.svg?style=social&label=Github)](https://github.com/KorAP/RKorAPClient)
## Description
@@ -104,8 +102,7 @@
The whole process is shown in this video:
-https://korap.github.io/RKorAPClient/man/figures/RKorAPClient-get-and-persists-accessToken.mp4
-
+https://user-images.githubusercontent.com/11092081/142769056-b389649b-eac4-435f-ac6d-1715474a5605.mp4
## Demos
@@ -158,6 +155,15 @@
devtools::install_git("https://korap.ids-mannheim.de/gerrit/KorAP/RKorAPClient")
remotes::install_git("https://korap.ids-mannheim.de/gerrit/KorAP/RKorAPClient")
```
+### Full installation videos
+
+#### Mac
+
+https://user-images.githubusercontent.com/11092081/142773435-ea7ef92a-7ea4-4c6d-a252-950e486352f2.mp4
+
+#### Ubuntu
+
+https://user-images.githubusercontent.com/11092081/142772382-1354b8db-551f-48de-a416-4fd59267662d.mp4
## Development and License
### RKorAPClient
diff --git a/demo/00Index b/demo/00Index
index c392b70..ad41293 100644
--- a/demo/00Index
+++ b/demo/00Index
@@ -9,3 +9,4 @@
displayKwics Display query results as KWICs via html
light-verb-construction-ca Collocation analysis to identify light verb constructions matching the pattern "in NN setzen", with result rendered as HTML DataTable
highcharter-example Visualize frequencies of optionally alternative terms over time with interactive HTML and JavaScript elements using the package highcharter as wrapper for Highcharts
+recursiveCA Show result dataframe of recursive collocation analysis as pretty HTML table by Knitr via Markdown.
diff --git a/demo/Rmd/ca.Rmd b/demo/Rmd/ca.Rmd
new file mode 100644
index 0000000..a4016a5
--- /dev/null
+++ b/demo/Rmd/ca.Rmd
@@ -0,0 +1,45 @@
+---
+title: "Kookkurrenzanalyse zu aufmerksam"
+output:
+ html_document:
+ css: style.css
+ keep_md: yes
+ self_contained: false
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = FALSE, warning = FALSE)  # the knitr chunk option is `warning`
+library(RKorAPClient)
+library(kableExtra)
+library(DT)
+library(tidyverse)
+kco <- new("KorAPConnection", verbose = TRUE)
+if (!exists('ca')) {  # reuse a result that already exists in the session
+  ca <- kco %>%
+    collocationAnalysis(
+      "aufmerksam",
+      leftContextSize = 2,
+      rightContextSize = 2,
+      exactFrequencies = TRUE,
+      searchHitsSampleLimit = 1000,
+      topCollocatesLimit = 10,
+      withinSpan = "", maxRecurse = 1,
+      addExamples = TRUE  # the table chunk below reads the `example` column
+    )
+}
+```
+
+```{r ca}
+ca %>%
+  mutate(Beispiel = sprintf('<a href="%s">%s</a>', webUIRequestUrl, example)) %>%
+  select(Beispiel, logDice, pmi, ll) %>%
+  head(50) %>%
+  datatable(escape = FALSE,  # the Beispiel column holds HTML links
+            extensions = c('Buttons'),
+            options = list(
+              buttons = c('copy', 'csv', 'excel', 'pdf', 'print'),
+              pageLength = 25,
+              dom = 'ftpB'
+            )) %>%
+  formatRound(columns = ~logDice + pmi + ll, digits = 2)
+```
diff --git a/demo/Rmd/style.css b/demo/Rmd/style.css
new file mode 100644
index 0000000..3eace4d
--- /dev/null
+++ b/demo/Rmd/style.css
@@ -0,0 +1,53 @@
+@import url('https://code.cdn.mozilla.net/fonts/fira.css'); /* explicit scheme: a // URL breaks when the page is opened from file:// */
+@import url('https://korap.ids-mannheim.de/font/libertinus.css');
+
+h1, h2, h3, h4, h5, h6 {
+  font-family: 'Fira Sans',sans-serif;
+  line-height: 1.2;
+  font-weight: 500;
+}
+
+.title, .subtitle {
+  text-transform: uppercase;
+  text-align: center;
+}
+
+.date, .author {
+  text-align: center;
+}
+
+body {
+  font-family: 'Fira Sans', sans-serif;
+  font-size: 18px;
+  font-weight: 400;
+  font-variant-ligatures: common-ligatures;
+  font-variant-numeric: tabular-nums; /* equal-width digits */
+}
+
+th {
+  color: rgb(246, 168, 0);
+}
+
+p {
+  hyphens: auto;
+  text-align: justify;
+  overflow-wrap: break-word;
+}
+
+.footnotes {
+  font-family: 'Fira Sans Condensed', sans-serif;
+  font-weight: 400;
+  font-size: 14px;
+  line-height: 1.5;
+}
+
+#TOC > ul {
+  font-family: 'Fira Sans', sans-serif;
+}
+
+.caption {
+  font-family: 'Fira Sans Condensed', sans-serif;
+  font-weight: 400;
+  font-size: 16px;
+  text-align: center;
+}
diff --git a/demo/recursiveCA.R b/demo/recursiveCA.R
new file mode 100644
index 0000000..0f2d104
--- /dev/null
+++ b/demo/recursiveCA.R
@@ -0,0 +1,28 @@
+library(RKorAPClient)
+library(tidyverse)
+library(knitr)
+library(rmarkdown)
+# Demo: recursive collocation analysis shown as an HTML table rendered from Markdown.
+nodeWordform <- 'aufmerksam'
+mdFile <- tempfile(nodeWordform, fileext = ".md")
+# Run the analysis, keep the full result in the global `ca`, write the top rows to mdFile.
+new("KorAPConnection", verbose = TRUE) %>%
+  collocationAnalysis(
+    nodeWordform,
+    leftContextSize = 2,
+    rightContextSize = 2,
+    exactFrequencies = TRUE,
+    searchHitsSampleLimit = 1000,
+    topCollocatesLimit = 10,
+    maxRecurse = 1,
+    addExamples = TRUE
+  ) %>%
+  mutate(LVC = sprintf("[%s](%s)", example, webUIRequestUrl)) %>%
+  { . ->> ca } %>%
+  select(LVC, logDice, pmi, ll) %>%
+  head(50) %>%
+  kable(format = "pipe", digits = 2) %>%
+  cat(file = mdFile, sep = "\n")
+# render() returns the path of the file it wrote, so open exactly that file.
+htmlFile <- rmarkdown::render(mdFile)
+browseURL(htmlFile)
diff --git a/man/KorAPConnection-class.Rd b/man/KorAPConnection-class.Rd
index ee18692..75a165e 100644
--- a/man/KorAPConnection-class.Rd
+++ b/man/KorAPConnection-class.Rd
@@ -23,7 +23,7 @@
apiUrl,
accessToken = getAccessToken(KorAPUrl),
userAgent = "R-KorAP-Client",
- timeout = 110,
+ timeout = 240,
verbose = FALSE,
cache = TRUE
)
diff --git a/man/RKorAPClient-package.Rd b/man/RKorAPClient-package.Rd
index e59738e..54822cd 100644
--- a/man/RKorAPClient-package.Rd
+++ b/man/RKorAPClient-package.Rd
@@ -8,14 +8,7 @@
\description{
\if{html}{\figure{logo.png}{options: align='right' alt='logo' width='120'}}
-A client package that makes the 'KorAP' web service API accessible from R.
- The corpus analysis platform 'KorAP' has been developed as a scientific tool to make
- potentially large, stratified and multiply annotated corpora, such as the 'German Reference Corpus DeReKo'
- or the 'Corpus of the Contemporary Romanian Language CoRoLa', accessible for linguists to let them verify
- hypotheses and to find interesting patterns in real language use.
- The 'RKorAPClient' package provides access to 'KorAP' and the corpora behind it for user-created R code,
- as a programmatic alternative to the 'KorAP' web user-interface.
- You can learn more about 'KorAP' and use it directly on 'DeReKo' at <https://korap.ids-mannheim.de/>.
+A client package that makes the 'KorAP' web service API accessible from R. The corpus analysis platform 'KorAP' has been developed as a scientific tool to make potentially large, stratified and multiply annotated corpora, such as the 'German Reference Corpus DeReKo' or the 'Corpus of the Contemporary Romanian Language CoRoLa', accessible for linguists to let them verify hypotheses and to find interesting patterns in real language use. The 'RKorAPClient' package provides access to 'KorAP' and the corpora behind it for user-created R code, as a programmatic alternative to the 'KorAP' web user-interface. You can learn more about 'KorAP' and use it directly on 'DeReKo' at <https://korap.ids-mannheim.de/>.
}
\references{
Kupietz, Marc / Diewald, Nils / Margaretha, Eliza (2020):
diff --git a/man/association-score-functions.Rd b/man/association-score-functions.Rd
index 914d96c..97454fc 100644
--- a/man/association-score-functions.Rd
+++ b/man/association-score-functions.Rd
@@ -78,7 +78,7 @@
Dunning, T. (1993): Accurate methods for the statistics of surprise and coincidence. Comput. Linguist. 19, 1 (March 1993), 61-74.
Evert, Stefan (2004): The Statistics of Word Cooccurrences: Word Pairs and Collocations. PhD dissertation, IMS, University of Stuttgart. Published in 2005, URN urn:nbn:de:bsz:93-opus-23714.
-Free PDF available from \url{http://purl.org/stefan.evert/PUB/Evert2004phd.pdf}
+Free PDF available from \url{https://purl.org/stefan.evert/PUB/Evert2004phd.pdf}
}
\seealso{
Other collocation analysis functions:
diff --git a/man/collocationAnalysis-KorAPConnection-method.Rd b/man/collocationAnalysis-KorAPConnection-method.Rd
index bc720ca..2a646f8 100644
--- a/man/collocationAnalysis-KorAPConnection-method.Rd
+++ b/man/collocationAnalysis-KorAPConnection-method.Rd
@@ -21,6 +21,12 @@
stopwords = append(RKorAPClient::synsemanticStopwords(), node),
seed = 7,
expand = length(vc) != length(node),
+ maxRecurse = 0,
+ addExamples = FALSE,
+ thresholdScore = "logDice",
+ threshold = 2,
+ localStopwords = c(),
+ collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
...
)
}
@@ -55,6 +61,18 @@
\item{expand}{if TRUE, \code{node} and \code{vc} parameters are expanded to all of their combinations}
+\item{maxRecurse}{apply collocation analysis recursively \code{maxRecurse} times}
+
+\item{addExamples}{If TRUE, examples for instances of collocations will be added in a column \code{example}. This makes a difference in particular if \code{node} is given as a lemma query.}
+
+\item{thresholdScore}{association score function (see \code{\link{association-score-functions}}) to use for computing the threshold that is applied for recursive collocation analysis calls}
+
+\item{threshold}{minimum value of \code{thresholdScore} function call to apply collocation analysis recursively}
+
+\item{localStopwords}{vector of stopwords that will not be considered as collocates in the current function call, but that will not be passed to recursive calls}
+
+\item{collocateFilterRegex}{allow only collocates matching the regular expression}
+
\item{...}{more arguments will be passed to \code{\link[=collocationScoreQuery]{collocationScoreQuery()}}}
}
\value{
diff --git a/man/hc_freq_by_year_ci.Rd b/man/hc_freq_by_year_ci.Rd
index df2a1b7..e8e560e 100644
--- a/man/hc_freq_by_year_ci.Rd
+++ b/man/hc_freq_by_year_ci.Rd
@@ -30,6 +30,32 @@
\strong{Warning:} This function may be moved to a new package.
}
+\examples{
+\dontrun{
+
+year <- c(1990:2018)
+alternatives <- c("macht []{0,3} Sinn", "ergibt []{0,3} Sinn")
+new("KorAPConnection", verbose = TRUE) \%>\%
+ frequencyQuery(query = alternatives,
+ vc = paste("textType = /Zeit.*/ & pubDate in", year),
+ as.alternatives = TRUE) \%>\%
+ hc_freq_by_year_ci(as.alternatives = TRUE)
+
+
+kco <- new("KorAPConnection", verbose = TRUE)
+expand_grid(
+ condition = c("textDomain = /Wirtschaft.*/", "textDomain != /Wirtschaft.*/"),
+ year = (2005:2011)
+) \%>\%
+ cbind(frequencyQuery(
+ kco,
+ "[tt/l=Heuschrecke]",
+ paste0(.$condition, " & pubDate in ", .$year)
+ )) \%>\%
+ hc_freq_by_year_ci()
+}
+
+}
\seealso{
Other highcharter-helpers:
\code{\link{hc_add_onclick_korap_search}()}
diff --git a/tests/testthat/test-collocations.R b/tests/testthat/test-collocations.R
index b97424b..e2d1755 100644
--- a/tests/testthat/test-collocations.R
+++ b/tests/testthat/test-collocations.R
@@ -24,7 +24,8 @@
rightContextSize = 0,
searchHitsSampleLimit = 100,
topCollocatesLimit = 1,
- exactFrequencies = FALSE
+ exactFrequencies = FALSE,
+ maxRecurse = 2
),
"access token"
)
@@ -35,8 +36,14 @@
test_that("collocationAnalysis on unaccounted strings does not error out", {
kco <- new("KorAPConnection", accessToken = NULL, verbose = TRUE)
expect_warning(
- df <- collocationAnalysis(kco, "XXXXXXXXAmeisenplage"),
+ df <- collocationAnalysis(kco, "XXXXXXXXAmeisenplage", vc=c("corpusSigle=/WDD17/", "corpusSigle=/WUD17/"), maxRecurse = 2),
"access token"
)
testthat::expect_equal(nrow(df), 0)
})
+
+test_that("removeWithinSpan works", {
+  expect_equal(  # the contains(<span>, ...) wrapper is dropped, the inner query is kept
+    removeWithinSpan("contains(<base/s=s>, (machen []{0,1} aufmerksam | aufmerksam []{0,1} machen))", "base/s=s"),
+    "(machen []{0,1} aufmerksam | aufmerksam []{0,1} machen)")
+})
diff --git a/tests/testthat/test-misc.R b/tests/testthat/test-misc.R
index 197424b..de9b266 100644
--- a/tests/testthat/test-misc.R
+++ b/tests/testthat/test-misc.R
@@ -11,7 +11,8 @@
c(
"referTo=x & textClass = /natur.*/ & pubDate in 2013",
"referTo=x & textClass = /freizeit.*/ & pubDate in 2014"
- ), pubDateOnly = T
+ ),
+ pubDateOnly = T
),
c("2013", "2014"))
@@ -19,7 +20,8 @@
c(
"referTo=x & textClass = /natur.*/ & creationDate in 2013",
"referTo=x & textClass = /freizeit.*/ & creationDate in 2014"
- ), pubDateOnly = T
+ ),
+ pubDateOnly = T
),
c("2013", "2014"))
@@ -27,7 +29,198 @@
c(
"referTo=x & textClass = /natur.*/ & creationDate in 2013",
"referTo=x & textClass = /freizeit.*/ & creationDate in 2014"
- ), excludePubDate = T
+ ),
+ excludePubDate = T
),
c("/natur.*/", "/freizeit.*/"))
})
+
+test_that("geom_freq_by_year_ci works correctly", { # fixture-based: the data frame below is hard-coded in this test
+ df <- # 14 rows: 2 textDomain conditions x years 2005-2011
+ structure(
+ list(
+ condition = c( # first 7 rows "=", last 7 rows "!=" /Wirtschaft.*/
+ "textDomain = /Wirtschaft.*/",
+ "textDomain = /Wirtschaft.*/",
+ "textDomain = /Wirtschaft.*/",
+ "textDomain = /Wirtschaft.*/",
+ "textDomain = /Wirtschaft.*/",
+ "textDomain = /Wirtschaft.*/",
+ "textDomain = /Wirtschaft.*/",
+ "textDomain != /Wirtschaft.*/",
+ "textDomain != /Wirtschaft.*/",
+ "textDomain != /Wirtschaft.*/",
+ "textDomain != /Wirtschaft.*/",
+ "textDomain != /Wirtschaft.*/",
+ "textDomain != /Wirtschaft.*/",
+ "textDomain != /Wirtschaft.*/"
+ ),
+ year = c( # 2005-2011, repeated once per condition
+ 2005L,
+ 2006L,
+ 2007L,
+ 2008L,
+ 2009L,
+ 2010L,
+ 2011L,
+ 2005L,
+ 2006L,
+ 2007L,
+ 2008L,
+ 2009L,
+ 2010L,
+ 2011L
+ ),
+ query = c( # the same query string in every row
+ "[tt/l=Heuschrecke]",
+ "[tt/l=Heuschrecke]",
+ "[tt/l=Heuschrecke]",
+ "[tt/l=Heuschrecke]",
+ "[tt/l=Heuschrecke]",
+ "[tt/l=Heuschrecke]",
+ "[tt/l=Heuschrecke]",
+ "[tt/l=Heuschrecke]",
+ "[tt/l=Heuschrecke]",
+ "[tt/l=Heuschrecke]",
+ "[tt/l=Heuschrecke]",
+ "[tt/l=Heuschrecke]",
+ "[tt/l=Heuschrecke]",
+ "[tt/l=Heuschrecke]"
+ ),
+ totalResults = c( # presumably the hit count per query/vc pair - TODO confirm
+ 531L,
+ 823L,
+ 1130L,
+ 496L,
+ 302L,
+ 159L,
+ 122L,
+ 2831L,
+ 2245L,
+ 2477L,
+ 2010L,
+ 1697L,
+ 1142L,
+ 1829L
+ ),
+ vc = c( # each value is the row's condition joined with " & pubDate in <year>"
+ "textDomain = /Wirtschaft.*/ & pubDate in 2005",
+ "textDomain = /Wirtschaft.*/ & pubDate in 2006",
+ "textDomain = /Wirtschaft.*/ & pubDate in 2007",
+ "textDomain = /Wirtschaft.*/ & pubDate in 2008",
+ "textDomain = /Wirtschaft.*/ & pubDate in 2009",
+ "textDomain = /Wirtschaft.*/ & pubDate in 2010",
+ "textDomain = /Wirtschaft.*/ & pubDate in 2011",
+ "textDomain != /Wirtschaft.*/ & pubDate in 2005",
+ "textDomain != /Wirtschaft.*/ & pubDate in 2006",
+ "textDomain != /Wirtschaft.*/ & pubDate in 2007",
+ "textDomain != /Wirtschaft.*/ & pubDate in 2008",
+ "textDomain != /Wirtschaft.*/ & pubDate in 2009",
+ "textDomain != /Wirtschaft.*/ & pubDate in 2010",
+ "textDomain != /Wirtschaft.*/ & pubDate in 2011"
+ ),
+ webUIRequestUrl = c( # percent-encoded query (q=) and vc (cq=) per row; column name is asserted as the plot's url label below
+ "https://korapt.ids-mannheim.de/?q=%5btt%2fl%3dHeuschrecke%5d&cq=textDomain%20%3d%20%2fWirtschaft.%2a%2f%20%26%20pubDate%20in%202005&ql=poliqarp",
+ "https://korapt.ids-mannheim.de/?q=%5btt%2fl%3dHeuschrecke%5d&cq=textDomain%20%3d%20%2fWirtschaft.%2a%2f%20%26%20pubDate%20in%202006&ql=poliqarp",
+ "https://korapt.ids-mannheim.de/?q=%5btt%2fl%3dHeuschrecke%5d&cq=textDomain%20%3d%20%2fWirtschaft.%2a%2f%20%26%20pubDate%20in%202007&ql=poliqarp",
+ "https://korapt.ids-mannheim.de/?q=%5btt%2fl%3dHeuschrecke%5d&cq=textDomain%20%3d%20%2fWirtschaft.%2a%2f%20%26%20pubDate%20in%202008&ql=poliqarp",
+ "https://korapt.ids-mannheim.de/?q=%5btt%2fl%3dHeuschrecke%5d&cq=textDomain%20%3d%20%2fWirtschaft.%2a%2f%20%26%20pubDate%20in%202009&ql=poliqarp",
+ "https://korapt.ids-mannheim.de/?q=%5btt%2fl%3dHeuschrecke%5d&cq=textDomain%20%3d%20%2fWirtschaft.%2a%2f%20%26%20pubDate%20in%202010&ql=poliqarp",
+ "https://korapt.ids-mannheim.de/?q=%5btt%2fl%3dHeuschrecke%5d&cq=textDomain%20%3d%20%2fWirtschaft.%2a%2f%20%26%20pubDate%20in%202011&ql=poliqarp",
+ "https://korapt.ids-mannheim.de/?q=%5btt%2fl%3dHeuschrecke%5d&cq=textDomain%20%21%3d%20%2fWirtschaft.%2a%2f%20%26%20pubDate%20in%202005&ql=poliqarp",
+ "https://korapt.ids-mannheim.de/?q=%5btt%2fl%3dHeuschrecke%5d&cq=textDomain%20%21%3d%20%2fWirtschaft.%2a%2f%20%26%20pubDate%20in%202006&ql=poliqarp",
+ "https://korapt.ids-mannheim.de/?q=%5btt%2fl%3dHeuschrecke%5d&cq=textDomain%20%21%3d%20%2fWirtschaft.%2a%2f%20%26%20pubDate%20in%202007&ql=poliqarp",
+ "https://korapt.ids-mannheim.de/?q=%5btt%2fl%3dHeuschrecke%5d&cq=textDomain%20%21%3d%20%2fWirtschaft.%2a%2f%20%26%20pubDate%20in%202008&ql=poliqarp",
+ "https://korapt.ids-mannheim.de/?q=%5btt%2fl%3dHeuschrecke%5d&cq=textDomain%20%21%3d%20%2fWirtschaft.%2a%2f%20%26%20pubDate%20in%202009&ql=poliqarp",
+ "https://korapt.ids-mannheim.de/?q=%5btt%2fl%3dHeuschrecke%5d&cq=textDomain%20%21%3d%20%2fWirtschaft.%2a%2f%20%26%20pubDate%20in%202010&ql=poliqarp",
+ "https://korapt.ids-mannheim.de/?q=%5btt%2fl%3dHeuschrecke%5d&cq=textDomain%20%21%3d%20%2fWirtschaft.%2a%2f%20%26%20pubDate%20in%202011&ql=poliqarp"
+ ),
+ total = c( # presumably the size of each vc in tokens - TODO confirm; used as the denominator of f
+ 35980430L,
+ 43834111L,
+ 45318302L,
+ 48021215L,
+ 43445640L,
+ 37850216L,
+ 43208130L,
+ 734690498L,
+ 754436010L,
+ 837243512L,
+ 870913993L,
+ 840473763L,
+ 758631859L,
+ 1650860702L
+ ),
+ f = c( # relative frequency, e.g. row 1 is 531 / 35980430
+ 1.4758022625077e-05,
+ 1.87753322977167e-05,
+ 2.49347382874142e-05,
+ 1.0328768232957e-05,
+ 6.95121535785869e-06,
+ 4.20076862969553e-06,
+ 2.82354269902447e-06,
+ 3.85332328062857e-06,
+ 2.97573282590262e-06,
+ 2.9585179992413e-06,
+ 2.30792020355103e-06,
+ 2.0190993160128e-06,
+ 1.50534147287901e-06,
+ 1.10790692260358e-06
+ ),
+ conf.low = c( # lower interval bound, on the same scale as ipm (not f)
+ 13.541726123006,
+ 17.5246639403598,
+ 23.5119781061303,
+ 9.44895282189122,
+ 6.19934083242504,
+ 3.58441307384462,
+ 2.35445121482762,
+ 3.71330289674541,
+ 2.85451040064217,
+ 2.84369529096211,
+ 2.20864478463198,
+ 1.92471184006826,
+ 1.41988948639017,
+ 1.05798679123886
+ ),
+ conf.high = c( # upper interval bound, on the same scale as ipm (not f)
+ 16.0822620637798,
+ 20.1144124716816,
+ 26.4429033931224,
+ 11.2894924346856,
+ 7.79280444795191,
+ 4.92066150039394,
+ 3.38358495629102,
+ 3.9985973655165,
+ 3.10207445836027,
+ 3.07795241551153,
+ 2.41163153037033,
+ 2.11808565716224,
+ 1.59589532013765,
+ 1.1601678685439
+ ),
+ ipm = c( # each value equals f * 1e6; this is the y aesthetic of the plot
+ 14.758022625077,
+ 18.7753322977167,
+ 24.9347382874142,
+ 10.328768232957,
+ 6.95121535785869,
+ 4.20076862969553,
+ 2.82354269902447,
+ 3.85332328062857,
+ 2.97573282590262,
+ 2.9585179992413,
+ 2.30792020355103,
+ 2.0190993160128,
+ 1.50534147287901,
+ 1.10790692260358
+ )
+ ),
+ class = "data.frame",
+ row.names = c(NA,-14L) # compact form of automatic row names 1..14
+ )
+ gpt <- df %>% ggplot(aes(year, ipm, fill = condition, color = condition)) +
+ geom_freq_by_year_ci() # aes() above sets no url mapping, so the url label checked next presumably comes from this layer - confirm
+ expect_equal(gpt[["labels"]][["url"]], "webUIRequestUrl") # the plot must expose the URL column as its "url" label
+ expect_equal(gpt[["data"]][["query"]][14], "[tt/l=Heuschrecke]") # the plot data must still hold the last fixture row
+})