update functions and tests to align with RKorAPClient 0.9.0
Change-Id: I0221c6cc0b9180bc83feb96651e0a5f204846451
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 13a764d..bab23fc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,15 @@
# Version history
+## 0.9.0
+
+- Updates recommended RKorAPClient version to 0.9.0
+- Added `matchStart` and `matchEnd` columns to `corpusQuery` results, containing the start and end positions of the match in the text
+- Added `mergeDuplicateCollocates` function to merge collocation analysis results for different context positions
+- Added a query column to collocation analysis results
+- Improved documentation for the `withinSpan` parameter in `collocationAnalysis` functions
+- Updated `textMetadata` method to use the new metadata fields API, if available, to retrieve custom metadata for a text based on its sigle
+- Added new unit tests to cover the new features and changes
+
## 0.8.1
- Updates recommended RKorAPClient version to 0.8.1
diff --git a/KorAPClient/__init__.py b/KorAPClient/__init__.py
index 9d46471..35c4c3a 100644
--- a/KorAPClient/__init__.py
+++ b/KorAPClient/__init__.py
@@ -15,7 +15,7 @@
from packaging import version
from rpy2.robjects.methods import RS4
-CURRENT_R_PACKAGE_VERSION = "0.8.1"
+CURRENT_R_PACKAGE_VERSION = "0.9.0"
KorAPClient = packages.importr('RKorAPClient')
if version.parse(KorAPClient.__version__) < version.parse(CURRENT_R_PACKAGE_VERSION):
@@ -206,7 +206,7 @@
- **topCollocatesLimit** - limit analysis to the n most frequent collocates in the search hits sample
- **searchHitsSampleLimit** - limit the size of the search hits sample
- **ignoreCollocateCase** - bool, set to True if collocate case should be ignored
- - **withinSpan** - KorAP span specification for collocations to be searched within
+ - **withinSpan** - KorAP span specification (see <https://korap.ids-mannheim.de/doc/ql/poliqarp-plus?embedded=true#spans>) for collocations to be searched within. Defaults to `base/s=s`
- **exactFrequencies** - if False, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies
- **stopwords** - vector of stopwords not to be considered as collocates
- **seed** - seed for random page collecting order
@@ -229,6 +229,11 @@
"""
return KorAPClient.collocationAnalysis(self, node, vc, **kwargs)
+ def mergeDuplicateCollocates(self, *args, **kwargs):
+ """Merge collocation analysis results for different context positions."""
+ return KorAPClient.mergeDuplicateCollocates(*args, **kwargs)  # thin pass-through to the imported R package; `self` is deliberately not forwarded, only *args/**kwargs
+
+
def corpusQuery(self, *args, **kwargs):
"""Query search term(s).
@@ -237,7 +242,7 @@
- **KorAPUrl** - instead of specifying the `query` and `vc` string parameters, you can copy your KorAP query URL here from the browser
- **metadataOnly** - determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. (default = True)
- **ql** - query language: `"poliqarp" | "cosmas2" | "annis" | "cql" | "fcsql"` (default = `"poliqarp"`)
- - **fields** - (meta)data fields that will be fetched for every match (default = `["corpusSigle", "textSigle", "pubDate", "pubPlace", "availability", "textClass"]`)
+ - **fields** - (meta)data fields that will be fetched for every match (default = `["corpusSigle", "textSigle", "pubDate", "pubPlace", "availability", "textClass", "matchStart", "matchEnd"]`)
- **verbose** - (default = `self.verbose`)
Returns:
diff --git a/KorAPClient/tests/test_korapclient.py b/KorAPClient/tests/test_korapclient.py
index eb44d1c..d4673b3 100644
--- a/KorAPClient/tests/test_korapclient.py
+++ b/KorAPClient/tests/test_korapclient.py
@@ -89,6 +89,52 @@
self.assertIn('creationDate', df.columns)
self.assertIn('pubPlace', df.columns)
self.assertIn('author', df.columns)
+
+ def test_corpus_query_token_api(self):  # checks the tokens.match / tokens.left / tokens.right columns of collectedMatches
+ q = self.kcon.corpusQuery("focus([tt/p=ADJA] {Newstickeritis})", vc="corpusSigle=/W.D17/", metadataOnly=False)
+ q = q.fetchNext()  # metadataOnly=False above: match content is needed here, not just metadata
+ matches = q.slots['collectedMatches']
+
+ self.assertGreater(len(matches), 10)  # the query must yield more than 10 collected matches
+
+ unique_matches = matches['tokens.match'].unique()
+ self.assertEqual(len(unique_matches), 1)  # exactly one distinct match string is expected
+ self.assertEqual(unique_matches[0], "Newstickeritis")
+
+ left_contexts = matches['tokens.left']
+ self.assertTrue(any('reine' in context for context in left_contexts))  # at least one left context contains 'reine'
+
+ right_contexts = matches['tokens.right']
+ self.assertTrue(any('Begriff' in context for context in right_contexts))  # at least one right context contains 'Begriff'
+
+ def test_match_start_and_end(self):  # checks the matchStart / matchEnd columns of collectedMatches
+ q = self.kcon.corpusQuery("focus([tt/p=ADJA] {Newstickeritis})", vc="corpusSigle=/W.D17/", metadataOnly=False)
+ q = q.fetchNext()
+ matches = q.slots['collectedMatches']
+
+ self.assertGreater(matches['matchEnd'].max(), 1000)  # the largest end position must exceed 1000
+ self.assertTrue((matches['matchEnd'] == matches['matchStart']).all())  # NOTE(review): start == end for every row; presumably the focused match is one token and matchEnd is inclusive - confirm
+
+ def test_extended_metadata_fields_ked(self):  # requests custom KED.* metadata fields and sanity-checks three of them
+ kcon_ked = KorAPConnection(KorAPUrl="https://korap.ids-mannheim.de/instance/ked", verbose=True)  # separate connection to the KED instance; self.kcon is not used here
+ q = kcon_ked.corpusQuery(
+ "einfache",
+ fields=[
+ "textSigle", "pubDate", "pubPlace", "availability", "textClass",
+ "snippet", "tokens", "KED.cover1Herder", "KED.cover2Herder",
+ "KED.cover3Herder", "KED.cover4Herder", "KED.cover5Herder",
+ "KED.nPara", "KED.nPunct1kTks", "KED.nSent", "KED.nToks",
+ "KED.nToksSentMd", "KED.nTyps", "KED.rcpnt", "KED.rcpntLabel",
+ "KED.strtgy", "KED.strtgyLabel", "KED.topic", "KED.topicLabel",
+ "KED.txttyp", "KED.txttypLabel"
+ ]
+ ).fetchAll()
+ df = q.slots['collectedMatches']
+ self.assertGreater(len(df), 0)  # at least one match must come back
+ self.assertGreater(min(df['KED.nToks'].astype(float)), 100)  # values are cast to float before comparing; every KED.nToks must exceed 100
+ self.assertGreater(min(df['KED.nSent'].astype(float)), 8)  # every KED.nSent must exceed 8
+ self.assertGreater(min(df['KED.rcpnt'].str.len()), 5)  # every KED.rcpnt string must be longer than 5 characters
+
if __name__ == '__main__':
diff --git a/pyproject.toml b/pyproject.toml
index 9cc876d..c32b40c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "KorAPClient"
-version = "0.8.1"
+version = "0.9.0"
description = "Client package to access KorAP's web service API"
authors = [
{name = "Marc Kupietz",email = "kupietz@ids-mannheim.de"},