Add interface for collocationAnalysis method
Change-Id: I121bfdda28cfa8a7d77554133c78eba97bf89eb7
diff --git a/KorAPClient/__init__.py b/KorAPClient/__init__.py
index 1e0ba2c..aeb5864 100644
--- a/KorAPClient/__init__.py
+++ b/KorAPClient/__init__.py
@@ -1,14 +1,15 @@
__pdoc__ = {'tests': False}
-import rpy2.robjects.packages as packages
-from rpy2.robjects.conversion import localconverter
-import rpy2.robjects.pandas2ri as pandas2ri
-import rpy2.robjects as robjects
-from rpy2.robjects.methods import RS4
-from packaging import version
import warnings
-CURRENT_R_PACKAGE_VERSION = "0.6.1.9000"
+import rpy2.robjects as robjects
+import rpy2.robjects.packages as packages
+import rpy2.robjects.pandas2ri as pandas2ri
+from packaging import version
+from rpy2.robjects.conversion import localconverter
+from rpy2.robjects.methods import RS4
+
+CURRENT_R_PACKAGE_VERSION = "0.7.1"
KorAPClient = packages.importr('RKorAPClient')
if version.parse(KorAPClient.__version__) < version.parse(CURRENT_R_PACKAGE_VERSION):
@@ -123,6 +124,47 @@
return robjects.conversion.rpy2py(KorAPClient.collocationScoreQuery(self, node, collocate, vc, **kwargs))
+ def collocationAnalysis(self, node, vc="", **kwargs):
+ """ **EXPERIMENTAL**: Performs a collocation analysis for the given node (or query) in the given virtual corpus.
+
+ - **node** - target word or list of target words
+ - **vc** - string or list of strings describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
+ - **lemmatizeNodeQuery** - if True, node query will be lemmatized, i.e. x -> [tt/l=x]
+ - **minOccur** - minimum absolute number of observed co-occurrences to consider a collocate candidate
+ - **leftContextSize** - size of the left context window
+ - **rightContextSize** - size of the right context window
+ - **topCollocatesLimit** - limit analysis to the n most frequent collocates in the search hits sample
+ - **searchHitsSampleLimit** - limit the size of the search hits sample
+ - **ignoreCollocateCase** - bool, set to True if collocate case should be ignored
+ - **withinSpan** - KorAP span specification for collocations to be searched within
+ - **exactFrequencies** - if False, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies
+ - **stopwords** - vector of stopwords not to be considered as collocates
+ - **seed** - seed for random page collecting order
+ - **expand** - if True, node and vc parameters are expanded to all of their combinations
+
+ Returns:
+ DataFrame with columns `'node', 'collocate', 'label', 'vc', 'webUIRequestUrl', 'w', 'leftContextSize',
+ 'rightContextSize', 'N', 'O', 'O1', 'O2', 'E', 'pmi', 'mi2', 'mi3', 'logDice', 'll'`
+
+ Details:
+ The collocation analysis is currently implemented on the client side, as some of the functionality is not yet provided by the KorAP backend. Mainly for this reason it is very slow (several minutes, up to hours), but on the other hand very flexible. You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries, and look for expression-internal collocates using the focus function (see examples and demo).
+ To increase speed at the cost of accuracy and possible false negatives, you can decrease searchHitsSampleLimit and/or topCollocatesLimit and/or set exactFrequencies to False.
+ Note that the analysis currently does not use the tokenization provided by the backend (i.e. by the corpus itself) but a makeshift one. This can also lead to false negatives and to frequencies that differ from the corresponding ones obtained via the web user interface.
+
+ Example:
+ ```
+ $ kcon = KorAPConnection(verbose=True)
+ $ df = kcon.collocationAnalysis("Grund")
+ ```
+ """
+ with localconverter(robjects.default_converter + pandas2ri.converter):
+ if type(node) is list:
+ node = robjects.StrVector(node)
+ if type(vc) is list:
+ vc = robjects.StrVector(vc)
+
+ return robjects.conversion.rpy2py(KorAPClient.collocationAnalysis(self, node, vc, **kwargs))
+
def corpusQuery(self, *args, **kwargs):
"""Query search term(s).
@@ -174,6 +216,7 @@
- **offset** - start offset for query results to fetch
- **maxFetch** - maximum number of query results to fetch
- **verbose**
+ - **randomizePageOrder** - fetch result pages in pseudo-random order if True (default = `False`)
Returns:
`KorAPQuery`
diff --git a/KorAPClient/tests/test_korapclient.py b/KorAPClient/tests/test_korapclient.py
index b359a19..0d19f8e 100644
--- a/KorAPClient/tests/test_korapclient.py
+++ b/KorAPClient/tests/test_korapclient.py
@@ -1,6 +1,8 @@
import unittest
+
from KorAPClient import KorAPConnection
+
class TestKorAPClient(unittest.TestCase):
def setUp(self):
self.kcon = KorAPConnection(verbose=True)
@@ -21,6 +23,13 @@
self.assertGreater(df['pmi'][0], 10)
self.assertLess(df['pmi'][0], 20)
+ def test_collocation_analysis(self):
+ df = self.kcon.collocationAnalysis( "focus([tt/p=ADJA] {Newstickeritis})", vc = "corpusSigle=/W.D17/", leftContextSize=1, rightContextSize=0,
+ searchHitsSampleLimit=1, topCollocatesLimit=1,
+ exactFrequencies=False)
+ self.assertEqual(df['rightContextSize'][0], 0)
+ self.assertGreater(df['O'][0], df['E'][0])
+
def test_collocation_score_query_multi_collocates(self):
df = self.kcon.collocationScoreQuery("Ameisenplage", ["einer", "heimgesucht"], leftContextSize=1, rightContextSize=1)
self.assertEqual(df['collocate'][1], 'heimgesucht')
diff --git a/Readme.md b/Readme.md
index da2981a..9eb6a32 100644
--- a/Readme.md
+++ b/Readme.md
@@ -79,6 +79,34 @@
```
![Frequency per million words of “Hello World“ in DE vs. AT from 2010 to 2018 in newspapers and magazines](figures/hello-world.png)
+### Identify *in … setzen* light verb constructions using the `collocationAnalysis` method
+[![Lifecycle:experimental](https://lifecycle.r-lib.org/articles/figures/lifecycle-experimental.svg)](https://www.tidyverse.org/lifecycle/#experimental)
+```python
+from KorAPClient import KorAPConnection
+
+kcon = KorAPConnection(verbose=True)
+results = kcon.collocationAnalysis("focus(in [tt/p=NN] {[tt/l=setzen]})",
+ leftContextSize=1,
+ rightContextSize=0,
+ exactFrequencies=False,
+ searchHitsSampleLimit=1000,
+ topCollocatesLimit=20)
+results['collocate'] = "[" + results['collocate'] +"](" + results['webUIRequestUrl'] +")"
+print(results[['collocate', 'logDice', 'pmi', 'll']].head(10).round(2).to_markdown(floatfmt=".2f"))
+```
+| | collocate | logDice | pmi | ll |
+|---:|:-----------------------------------------------------------------------------------------------------------------------------------------------------------|----------:|------:|----------:|
+| 1 | [Szene](https://korap.ids-mannheim.de/?q=Szene%20focus%28in%20%5btt%2fp%3dNN%5d%20%7b%5btt%2fl%3dsetzen%5d%7d%29&ql=poliqarp) | 10.37 | 11.54 | 824928.58 |
+| 2 | [Gang](https://korap.ids-mannheim.de/?q=Gang%20focus%28in%20%5btt%2fp%3dNN%5d%20%7b%5btt%2fl%3dsetzen%5d%7d%29&ql=poliqarp) | 9.65 | 10.99 | 366993.93 |
+| 3 | [Verbindung](https://korap.ids-mannheim.de/?q=Verbindung%20focus%28in%20%5btt%2fp%3dNN%5d%20%7b%5btt%2fl%3dsetzen%5d%7d%29&ql=poliqarp) | 9.20 | 10.34 | 347644.75 |
+| 4 | [Kenntnis](https://korap.ids-mannheim.de/?q=Kenntnis%20focus%28in%20%5btt%2fp%3dNN%5d%20%7b%5btt%2fl%3dsetzen%5d%7d%29&ql=poliqarp) | 9.15 | 10.67 | 206902.89 |
+| 5 | [Bewegung](https://korap.ids-mannheim.de/?q=Bewegung%20focus%28in%20%5btt%2fp%3dNN%5d%20%7b%5btt%2fl%3dsetzen%5d%7d%29&ql=poliqarp) | 8.80 | 9.91 | 264577.07 |
+| 6 | [Brand](https://korap.ids-mannheim.de/?q=Brand%20focus%28in%20%5btt%2fp%3dNN%5d%20%7b%5btt%2fl%3dsetzen%5d%7d%29&ql=poliqarp) | 8.76 | 9.97 | 210654.43 |
+| 7 | [Anführungszeichen](https://korap.ids-mannheim.de/?q=Anf%c3%bchrungszeichen%20focus%28in%20%5btt%2fp%3dNN%5d%20%7b%5btt%2fl%3dsetzen%5d%7d%29&ql=poliqarp) | 8.06 | 12.52 | 54148.31 |
+| 8 | [Kraft](https://korap.ids-mannheim.de/?q=Kraft%20focus%28in%20%5btt%2fp%3dNN%5d%20%7b%5btt%2fl%3dsetzen%5d%7d%29&ql=poliqarp) | 7.94 | 8.91 | 189399.70 |
+| 9 | [Beziehung](https://korap.ids-mannheim.de/?q=Beziehung%20focus%28in%20%5btt%2fp%3dNN%5d%20%7b%5btt%2fl%3dsetzen%5d%7d%29&ql=poliqarp) | 6.92 | 8.29 | 37723.54 |
+| 10 | [Relation](https://korap.ids-mannheim.de/?q=Relation%20focus%28in%20%5btt%2fp%3dNN%5d%20%7b%5btt%2fl%3dsetzen%5d%7d%29&ql=poliqarp) | 6.64 | 10.24 | 17105.84 |
+
## Command Line Invocation
The Python KorAP client can also be called from the command line and shell scripts:
```shell script
@@ -101,6 +129,7 @@
example:
python -m KorAPClient -v --query "Hello World" "Hallo Welt" --vc "pubDate in 2017" "pubDate in 2018" "pubDate in 2019"
```
+
### Accessed API Services
By using the KorAPClient you agree to the respective terms of use of the accessed KorAP API services which will be printed upon opening a connection.