Expand or join lists and vectors when converting R data frames

The column with the list "tokens" containing the elements left, match,
right becomes three columns: tokens.left, tokens.match, tokens.right

Data frames with list values cannot be converted by rpy2 out of the box.

Join vectors and lists in df cells with Tab

Pandas data frame cells cannot contain vectors.

E.g. a value in the tokens.right column could be
"abgerissen\tund\tan\tanderer\tStelle"

Resolves #45

Change-Id: I6b7d369689ee441de34b698c29eba62b1f54844b
diff --git a/KorAPClient/__init__.py b/KorAPClient/__init__.py
index 9d7393c..ba58971 100644
--- a/KorAPClient/__init__.py
+++ b/KorAPClient/__init__.py
@@ -4,9 +4,14 @@
 from itertools import product
 
 import pandas as pd
+from rpy2.rinterface_lib.sexp import StrSexpVector, NULLType
+from rpy2.robjects import numpy2ri
+from rpy2.robjects.conversion import localconverter, get_conversion
+
 import rpy2.robjects as robjects
 import rpy2.robjects.packages as packages
 import rpy2.robjects.pandas2ri as pandas2ri
+from rpy2 import rinterface as ri
 from packaging import version
 from rpy2.robjects.methods import RS4
 
@@ -27,6 +32,47 @@
 
 robjects.conversion.set_conversion(robjects.default_converter + pandas2ri.converter + korapclient_converter)
 
+fix_lists_in_dataframes = robjects.default_converter
+
+
+@fix_lists_in_dataframes.rpy2py.register(StrSexpVector)
+def to_str(obj):
+    """Join the elements of an R character vector into one tab-separated string."""
+    # Stringify while joining; the R vector handed in is left unmodified.
+    return "\t".join(str(item) for item in obj)
+
+def my_cv(obj, cv):
+    """Convert one column with `cv`, except R character vectors, which stay string vectors."""
+    if not isinstance(obj, ri.StrSexpVector):
+        # Everything else is delegated to the converter passed in by the caller.
+        return cv.rpy2py(obj)
+    # Build a new vector from str() of each cell instead of overwriting the R data in place.
+    return StrSexpVector([str(cell) for cell in obj])
+
+def toDataFrame(obj):
+    """Build a pandas DataFrame from `obj`; each element of a list column becomes a '<column>.<element>' column."""
+    cv = get_conversion() # get the converter from current context
+    names = []
+    objects = []
+    for i in range(len(obj)):
+        if isinstance(obj[i], ri.ListSexpVector):
+            list_name = obj.names[i] +  "." if not isinstance(obj.names, NULLType) else "l" + str(i) + "."
+            for j in range(len(obj[i])):
+                local_name = str(obj[i].names[j]) if not isinstance(obj[i].names, NULLType) else str(j)
+                names.append(list_name + local_name)
+                objects.append(obj[i][j])
+        else:
+            names.append(obj.names[i])
+            objects.append(obj[i])
+
+    return pd.DataFrame(
+        {str(k): my_cv(objects[i], cv) for i, k in enumerate(names)}
+    )
+
+# associate the converter with R data.frame class
+fix_lists_in_dataframes.rpy2py_nc_map[ri.ListSexpVector].update({"data.frame": toDataFrame})
+
+
 
 def expand_grid(dictionary):
     """Create a pandas DataFrame from all combinations of inputs
@@ -256,7 +302,12 @@
         Returns:
             `KorAPQuery`
         """
-        return KorAPClient.fetchNext(self, *args, **kwargs)
+
+        res = KorAPClient.fetchNext(self, *args, **kwargs)
+        with localconverter(fix_lists_in_dataframes):
+            df = res.slots['collectedMatches']
+        res.slots['collectedMatches'] = df
+        return res
 
     def fetchRest(self, *args, **kwargs):
         """Fetch remaining query results
@@ -266,7 +317,11 @@
         Returns:
             `KorAPQuery`
         """
-        return KorAPClient.fetchRest(self, *args, **kwargs)
+        res = KorAPClient.fetchRest(self, *args, **kwargs)
+        with localconverter(fix_lists_in_dataframes):
+            df = res.slots['collectedMatches']
+        res.slots['collectedMatches'] = df
+        return res
 
     def fetchAll(self, *args, **kwargs):
         """Fetch all query results
@@ -279,4 +334,9 @@
         Example:
             See `KorAPConnection.corpusQuery`.
         """
-        return KorAPClient.fetchAll(self, *args, **kwargs)
+        res = KorAPClient.fetchAll(self, *args, **kwargs)
+        with localconverter(fix_lists_in_dataframes):
+            df = res.slots['collectedMatches']
+        res.slots['collectedMatches'] = df
+        return res
+
diff --git a/KorAPClient/tests/test_korapclient.py b/KorAPClient/tests/test_korapclient.py
index 1e8c164..e5e9988 100644
--- a/KorAPClient/tests/test_korapclient.py
+++ b/KorAPClient/tests/test_korapclient.py
@@ -11,6 +11,36 @@
         q = self.kcon.corpusQuery("Test")
         self.assertEqual(q.slots['class'][0], 'KorAPQuery')
 
+    def test_query_with_snippets(self):
+        q = self.kcon.corpusQuery("Ameisenplage", metadataOnly=False).fetchNext()
+        self.assertIn('collectedMatches', q.slots)
+        self.assertIsInstance(q.slots['collectedMatches']['tokens.match'].iloc[0], str)
+
+    def test_query_with_snippets_is_tokenized_with_fetch_next(self):
+        q = self.kcon.corpusQuery("Ameisenplage", metadataOnly=False).fetchNext()
+        self.assertIsInstance(q.slots['collectedMatches']['tokens.left'].iloc[0], str)
+        self.assertIsInstance(q.slots['collectedMatches']['tokens.match'].iloc[0], str)
+        self.assertIsInstance(q.slots['collectedMatches']['tokens.right'].iloc[0], str)
+        left_contexts = "".join(q.slots['collectedMatches']['tokens.left'])
+        self.assertIn('\t', left_contexts)
+
+    def test_query_with_snippets_is_tokenized_with_fetch_all(self):
+        q = self.kcon.corpusQuery("Ameisenplage", metadataOnly=False).fetchAll()
+        self.assertIsInstance(q.slots['collectedMatches']['tokens.left'].iloc[0], str)
+        self.assertIsInstance(q.slots['collectedMatches']['tokens.match'].iloc[0], str)
+        self.assertIsInstance(q.slots['collectedMatches']['tokens.right'].iloc[0], str)
+        left_contexts = "".join(q.slots['collectedMatches']['tokens.left'])
+        self.assertIn('\t', left_contexts)
+
+    def test_query_with_snippets_is_tokenized_with_fetch_rest(self):
+        q = self.kcon.corpusQuery("Ameisenplage", metadataOnly=False).fetchRest()
+        self.assertIsInstance(q.slots['collectedMatches']['tokens.left'].iloc[0], str)
+        self.assertIsInstance(q.slots['collectedMatches']['tokens.match'].iloc[0], str)
+        self.assertIsInstance(q.slots['collectedMatches']['tokens.right'].iloc[0], str)
+        self.assertIsInstance(q.slots['collectedMatches']['textSigle'].iloc[1], str)
+        left_contexts = "".join(q.slots['collectedMatches']['tokens.left'])
+        self.assertIn('\t', left_contexts)
+
     def test_frequency_query(self):
         df = self.kcon.frequencyQuery("Ameisenplage")
         self.assertGreater(df['totalResults'].iloc[0], 10)