Expand or join lists and vectors when converting R data frames
The column with the list "tokens" containing the elements left, match,
right becomes three columns: tokens.left, tokens.match, tokens.right
Data frames with list values cannot be converted by rpy2 out of the box.
Join vectors and lists in df cells with Tab
Pandas data frame cells cannot contain vectors.
E.g. a value in the tokens.right column could be
"abgerissen\tund\tan\tanderer\tStelle"
Resolves #45
Change-Id: I6b7d369689ee441de34b698c29eba62b1f54844b
diff --git a/KorAPClient/__init__.py b/KorAPClient/__init__.py
index 9d7393c..ba58971 100644
--- a/KorAPClient/__init__.py
+++ b/KorAPClient/__init__.py
@@ -4,9 +4,14 @@
from itertools import product
import pandas as pd
+from rpy2.rinterface_lib.sexp import StrSexpVector, NULLType
+from rpy2.robjects import numpy2ri
+from rpy2.robjects.conversion import localconverter, get_conversion
+
import rpy2.robjects as robjects
import rpy2.robjects.packages as packages
import rpy2.robjects.pandas2ri as pandas2ri
+from rpy2 import rinterface as ri
from packaging import version
from rpy2.robjects.methods import RS4
@@ -27,6 +32,47 @@
robjects.conversion.set_conversion(robjects.default_converter + pandas2ri.converter + korapclient_converter)
+# Converter used (via localconverter) when reading R data frames whose cells
+# hold lists or vectors.
+# NOTE(review): this binds the name to the robjects.default_converter object
+# itself (plain assignment, no copy), so the registrations made on
+# fix_lists_in_dataframes also land on the default converter -- confirm that
+# this is intended.
+fix_lists_in_dataframes = robjects.default_converter
+
+
+@fix_lists_in_dataframes.rpy2py.register(StrSexpVector)
+def to_str(obj):
+    """Join the elements of an R character vector into one string.
+
+    Registered as the rpy2py conversion for ``StrSexpVector`` on
+    ``fix_lists_in_dataframes``.
+
+    Args:
+        obj: The vector to convert. It is only read, never modified.
+
+    Returns:
+        A single ``str`` made of ``str(element)`` for every element of
+        ``obj``, separated by tab characters (empty string for an empty
+        vector).
+    """
+    # Stringify on the fly instead of writing the strings back into ``obj``,
+    # so the caller's R vector is left untouched.
+    return "\t".join(str(element) for element in obj)
+
+def my_cv(obj, cv):
+    """Convert one column object for use as a pandas DataFrame column.
+
+    Args:
+        obj: The object to convert. It is only read, never modified.
+        cv: Converter whose ``rpy2py`` is applied to every object that is
+            not an R character vector.
+
+    Returns:
+        For an ``ri.StrSexpVector``: a new ``StrSexpVector`` holding
+        ``str(element)`` for each element of ``obj``.
+        Otherwise: the result of ``cv.rpy2py(obj)``.
+    """
+    if not isinstance(obj, ri.StrSexpVector):
+        return cv.rpy2py(obj)
+    # Character vectors are handled here, element by element, rather than
+    # through cv.rpy2py (presumably because the to_str conversion registered
+    # for StrSexpVector would collapse the whole column into one string).
+    # A fresh vector is built so the elements of ``obj`` are not overwritten.
+    return StrSexpVector([str(element) for element in obj])
+
+def toDataFrame(obj):
+    """Build a pandas DataFrame from ``obj``, expanding list-valued elements.
+
+    Every element of ``obj`` that is an ``ri.ListSexpVector`` is replaced by
+    one column per member, named ``<element name>.<member name>``. If ``obj``
+    has no names, ``l<index>`` is used as the element name; if the nested
+    list has no names, the member index is used as the member name. All
+    other elements become single columns named after ``obj.names``.
+
+    Args:
+        obj: The list-like R object to convert.
+
+    Returns:
+        A ``pd.DataFrame`` whose values were converted with ``my_cv`` using
+        the converter of the current conversion context.
+    """
+    converter = get_conversion()  # converter of the current context
+    column_names = []
+    column_values = []
+    for pos in range(len(obj)):
+        column = obj[pos]
+        if not isinstance(column, ri.ListSexpVector):
+            # Plain column: taken over unchanged.
+            column_names.append(obj.names[pos])
+            column_values.append(column)
+            continue
+        # Nested list: one output column per member, prefixed by the
+        # element name (or "l<index>" when obj carries no names).
+        if isinstance(obj.names, NULLType):
+            prefix = "l" + str(pos) + "."
+        else:
+            prefix = obj.names[pos] + "."
+        member_names = column.names
+        unnamed = isinstance(member_names, NULLType)
+        for member_pos in range(len(column)):
+            suffix = str(member_pos) if unnamed else str(member_names[member_pos])
+            column_names.append(prefix + suffix)
+            column_values.append(column[member_pos])
+
+    return pd.DataFrame(
+        {str(name): my_cv(value, converter) for name, value in zip(column_names, column_values)}
+    )
+
+# Associate toDataFrame with the R class "data.frame": add it to the
+# converter's rpy2py_nc_map entry for ri.ListSexpVector under that class name.
+fix_lists_in_dataframes.rpy2py_nc_map[ri.ListSexpVector].update({"data.frame": toDataFrame})
+
+
def expand_grid(dictionary):
"""Create a pandas DataFrame from all combinations of inputs
@@ -256,7 +302,12 @@
Returns:
`KorAPQuery`
"""
- return KorAPClient.fetchNext(self, *args, **kwargs)
+
+ res = KorAPClient.fetchNext(self, *args, **kwargs)
+ with localconverter(fix_lists_in_dataframes):
+ df = res.slots['collectedMatches']
+ res.slots['collectedMatches'] = df
+ return res
def fetchRest(self, *args, **kwargs):
"""Fetch remaining query results
@@ -266,7 +317,11 @@
Returns:
`KorAPQuery`
"""
- return KorAPClient.fetchRest(self, *args, **kwargs)
+ res = KorAPClient.fetchRest(self, *args, **kwargs)
+ with localconverter(fix_lists_in_dataframes):
+ df = res.slots['collectedMatches']
+ res.slots['collectedMatches'] = df
+ return res
def fetchAll(self, *args, **kwargs):
"""Fetch all query results
@@ -279,4 +334,9 @@
Example:
See `KorAPConnection.corpusQuery`.
"""
- return KorAPClient.fetchAll(self, *args, **kwargs)
+ # Delegate to KorAPClient.fetchAll (not fetchRest), matching this method's name.
+ res = KorAPClient.fetchAll(self, *args, **kwargs)
+ with localconverter(fix_lists_in_dataframes):
+ df = res.slots['collectedMatches']
+ res.slots['collectedMatches'] = df
+ return res
+
diff --git a/KorAPClient/tests/test_korapclient.py b/KorAPClient/tests/test_korapclient.py
index 1e8c164..e5e9988 100644
--- a/KorAPClient/tests/test_korapclient.py
+++ b/KorAPClient/tests/test_korapclient.py
@@ -11,6 +11,36 @@
q = self.kcon.corpusQuery("Test")
self.assertEqual(q.slots['class'][0], 'KorAPQuery')
+    def test_query_with_snippets(self):
+        # A query with metadataOnly=False must expose a collectedMatches slot
+        # whose first tokens.match cell is a plain string.
+        query = self.kcon.corpusQuery("Ameisenplage", metadataOnly=False).fetchNext()
+        self.assertIn('collectedMatches', query.slots)
+        first_match = query.slots['collectedMatches']['tokens.match'].iloc[0]
+        self.assertIsInstance(first_match, str)
+
+    def test_query_with_snippets_is_tokenized_with_fetch_next(self):
+        query = self.kcon.corpusQuery("Ameisenplage", metadataOnly=False).fetchNext()
+        # The first cell of every token column must be a plain string.
+        for column in ('tokens.left', 'tokens.match', 'tokens.right'):
+            self.assertIsInstance(query.slots['collectedMatches'][column].iloc[0], str)
+        # The left contexts, concatenated, must contain a tab separator.
+        joined_left = "".join(query.slots['collectedMatches']['tokens.left'])
+        self.assertIn('\t', joined_left)
+
+    def test_query_with_snippets_is_tokenized_with_fetch_all(self):
+        query = self.kcon.corpusQuery("Ameisenplage", metadataOnly=False).fetchAll()
+        # The first cell of every token column must be a plain string.
+        for column in ('tokens.left', 'tokens.match', 'tokens.right'):
+            self.assertIsInstance(query.slots['collectedMatches'][column].iloc[0], str)
+        # The left contexts, concatenated, must contain a tab separator.
+        joined_left = "".join(query.slots['collectedMatches']['tokens.left'])
+        self.assertIn('\t', joined_left)
+
+    def test_query_with_snippets_is_tokenized_with_fetch_rest(self):
+        query = self.kcon.corpusQuery("Ameisenplage", metadataOnly=False).fetchRest()
+        # The first cell of every token column must be a plain string.
+        for column in ('tokens.left', 'tokens.match', 'tokens.right'):
+            self.assertIsInstance(query.slots['collectedMatches'][column].iloc[0], str)
+        # The second row's textSigle must be a plain string as well.
+        self.assertIsInstance(query.slots['collectedMatches']['textSigle'].iloc[1], str)
+        # The left contexts, concatenated, must contain a tab separator.
+        joined_left = "".join(query.slots['collectedMatches']['tokens.left'])
+        self.assertIn('\t', joined_left)
+
def test_frequency_query(self):
df = self.kcon.frequencyQuery("Ameisenplage")
self.assertGreater(df['totalResults'].iloc[0], 10)