update functions and tests to align with RKorAPClient 0.9.0

Change-Id: I0221c6cc0b9180bc83feb96651e0a5f204846451

commit: 3386c1f1b6496e8d73960a636ede929aae336584 [log] [tgz]
author: „feldmueller“ <„feldmueller@posteo.de“> Mon Jan 20 17:06:29 2025 +0100
committer: Marc Kupietz <kupietz@ids-mannheim.de> Tue Jan 21 11:07:49 2025 +0100
tree: 22a16e9fb2d5ec9157a2d462bfdd9612e9e3bf0a
parent: e8c7adc3edff6e98c0955931a4246a6bb800b68d [diff]
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 13a764d..bab23fc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md

@@ -1,5 +1,15 @@
 # Version history
 
+## 0.9.0
+
+- Updates recommended RKorAPClient version to 0.9.0
+- Added `matchStart` and `matchEnd` columns to corpusQuery results, containing the start and end positions of the match in the text
+- Added `mergeDuplicateCollocates` function to merge collocation analysis results for different context positions
+- Added a query column to collocation analysis results
+- Improved documentation for span parameter in `collocationAnalysis` functions
+- Updated `textMetadata` method to use new metadata fields API, if available, to retrieve custom metadata for a text based on its sigle
+- Added new unit tests to cover the new features and changes
+
 ## 0.8.1
 
 - Updates recommended RKorAPClient version to 0.8.1

diff --git a/KorAPClient/__init__.py b/KorAPClient/__init__.py
index 9d46471..35c4c3a 100644
--- a/KorAPClient/__init__.py
+++ b/KorAPClient/__init__.py

@@ -15,7 +15,7 @@
 from packaging import version
 from rpy2.robjects.methods import RS4
 
-CURRENT_R_PACKAGE_VERSION = "0.8.1"
+CURRENT_R_PACKAGE_VERSION = "0.9.0"
 
 KorAPClient = packages.importr('RKorAPClient')
 if version.parse(KorAPClient.__version__) < version.parse(CURRENT_R_PACKAGE_VERSION):
@@ -206,7 +206,7 @@
         - **topCollocatesLimit** - limit analysis to the n most frequent collocates in the search hits sample
         - **searchHitsSampleLimit** - limit the size of the search hits sample
         - **ignoreCollocateCase** - bool, set to True if collocate case should be ignored
-        - **withinSpan** - KorAP span specification for collocations to be searched within
+        - **withinSpan** - KorAP span specification (see <https://korap.ids-mannheim.de/doc/ql/poliqarp-plus?embedded=true#spans>) for collocations to be searched within. Defaults to `base/s=s`
         - **exactFrequencies** - if False, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies
         - **stopwords** - vector of stopwords not to be considered as collocates
         - **seed** - seed for random page collecting order
@@ -229,6 +229,11 @@
         """
         return KorAPClient.collocationAnalysis(self, node, vc, **kwargs)
 
+    def mergeDuplicateCollocates(self, *args, **kwargs):
+        """Merge collocation analysis results for different context positions."""
+        return KorAPClient.mergeDuplicateCollocates(*args, **kwargs)
+
+
     def corpusQuery(self, *args, **kwargs):
         """Query search term(s).
 
@@ -237,7 +242,7 @@
         - **KorAPUrl** - instead of specifying the `query` and `vc` string parameters, you can copy your KorAP query URL here from the browser
         - **metadataOnly** - determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. (default = True)
         - **ql** - query language: `"poliqarp" | "cosmas2" | "annis" | "cql" | "fcsql"` (default = `"poliqarp"`)
-        - **fields** - (meta)data fields that will be fetched for every match (default = `["corpusSigle", "textSigle", "pubDate",  "pubPlace", "availability", "textClass"]`)
+        - **fields** - (meta)data fields that will be fetched for every match (default = `["corpusSigle", "textSigle", "pubDate",  "pubPlace", "availability", "textClass", "matchStart", "matchEnd"]`)
         - **verbose** - (default = `self.verbose`)
 
         Returns:

diff --git a/KorAPClient/tests/test_korapclient.py b/KorAPClient/tests/test_korapclient.py
index eb44d1c..d4673b3 100644
--- a/KorAPClient/tests/test_korapclient.py
+++ b/KorAPClient/tests/test_korapclient.py

@@ -89,6 +89,52 @@
         self.assertIn('creationDate', df.columns)
         self.assertIn('pubPlace', df.columns)
         self.assertIn('author', df.columns)
+    
+    def test_corpus_query_token_api(self):
+        q = self.kcon.corpusQuery("focus([tt/p=ADJA] {Newstickeritis})", vc="corpusSigle=/W.D17/", metadataOnly=False)
+        q = q.fetchNext()
+        matches = q.slots['collectedMatches']
+        
+        self.assertGreater(len(matches), 10)
+        
+        unique_matches = matches['tokens.match'].unique()
+        self.assertEqual(len(unique_matches), 1)
+        self.assertEqual(unique_matches[0], "Newstickeritis")
+        
+        left_contexts = matches['tokens.left']
+        self.assertTrue(any('reine' in context for context in left_contexts))
+        
+        right_contexts = matches['tokens.right']
+        self.assertTrue(any('Begriff' in context for context in right_contexts))
+    
+    def test_match_start_and_end(self):
+        q = self.kcon.corpusQuery("focus([tt/p=ADJA] {Newstickeritis})", vc="corpusSigle=/W.D17/", metadataOnly=False)
+        q = q.fetchNext()
+        matches = q.slots['collectedMatches']
+        
+        self.assertGreater(matches['matchEnd'].max(), 1000)
+        self.assertTrue((matches['matchEnd'] == matches['matchStart']).all())
+
+    def test_extended_metadata_fields_ked(self):
+        kcon_ked = KorAPConnection(KorAPUrl="https://korap.ids-mannheim.de/instance/ked", verbose=True)
+        q = kcon_ked.corpusQuery(
+            "einfache",
+            fields=[
+                "textSigle", "pubDate", "pubPlace", "availability", "textClass",
+                "snippet", "tokens", "KED.cover1Herder", "KED.cover2Herder",
+                "KED.cover3Herder", "KED.cover4Herder", "KED.cover5Herder",
+                "KED.nPara", "KED.nPunct1kTks", "KED.nSent", "KED.nToks",
+                "KED.nToksSentMd", "KED.nTyps", "KED.rcpnt", "KED.rcpntLabel",
+                "KED.strtgy", "KED.strtgyLabel", "KED.topic", "KED.topicLabel",
+                "KED.txttyp", "KED.txttypLabel"
+            ]
+        ).fetchAll()
+        df = q.slots['collectedMatches']
+        self.assertGreater(len(df), 0)
+        self.assertGreater(min(df['KED.nToks'].astype(float)), 100)
+        self.assertGreater(min(df['KED.nSent'].astype(float)), 8)
+        self.assertGreater(min(df['KED.rcpnt'].str.len()), 5)
+
 
 
 if __name__ == '__main__':

diff --git a/pyproject.toml b/pyproject.toml
index 9cc876d..c32b40c 100644
--- a/pyproject.toml
+++ b/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "KorAPClient"
-version = "0.8.1"
+version = "0.9.0"
 description = "Client package to access KorAP's web service API"
 authors = [
     {name = "Marc Kupietz",email = "kupietz@ids-mannheim.de"},
commit	3386c1f1b6496e8d73960a636ede929aae336584	[log] [tgz]
author	„feldmueller“ <„feldmueller@posteo.de“>	Mon Jan 20 17:06:29 2025 +0100
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Tue Jan 21 11:07:49 2025 +0100
tree	22a16e9fb2d5ec9157a2d462bfdd9612e9e3bf0a
parent	e8c7adc3edff6e98c0955931a4246a6bb800b68d [diff]