blob: 35c4c3a0140fa7a444786eb06606d2393aba67f3 [file] [log] [blame]
# pdoc configuration: presumably keeps the `tests` submodule out of the
# generated API documentation -- confirm against pdoc's `__pdoc__` rules.
__pdoc__ = {'tests': False}
2
Marc Kupietz27a1a2a2021-03-07 22:04:03 +01003import warnings
Marc Kupietz49ef0122022-09-29 17:57:01 +02004from itertools import product
Marc Kupietz27a1a2a2021-03-07 22:04:03 +01005
Marc Kupietz49ef0122022-09-29 17:57:01 +02006import pandas as pd
Marc Kupietza25adf02024-01-26 18:33:43 +01007from rpy2.rinterface_lib.sexp import StrSexpVector, NULLType
8from rpy2.robjects import numpy2ri
9from rpy2.robjects.conversion import localconverter, get_conversion
10
Marc Kupietzc18d5982021-10-02 16:38:30 +020011import rpy2.robjects as robjects
12import rpy2.robjects.packages as packages
13import rpy2.robjects.pandas2ri as pandas2ri
Marc Kupietza25adf02024-01-26 18:33:43 +010014from rpy2 import rinterface as ri
Marc Kupietzc18d5982021-10-02 16:38:30 +020015from packaging import version
Marc Kupietzc18d5982021-10-02 16:38:30 +020016from rpy2.robjects.methods import RS4
17
# Version of the R package RKorAPClient that the installed package is compared
# against below; older installations trigger a warning.
CURRENT_R_PACKAGE_VERSION = "0.9.0"

# Import the R package through rpy2; the R functions are then reached as
# attributes of `KorAPClient`.
KorAPClient = packages.importr('RKorAPClient')
# Warn, but keep going, when the installed R package is older than expected.
# NOTE(review): DeprecationWarning is hidden by Python's default warning filters
# unless it is triggered from __main__ -- confirm this category is intended.
if version.parse(KorAPClient.__version__) < version.parse(CURRENT_R_PACKAGE_VERSION):
    warnings.warn("R-package RKorAPClient version " + KorAPClient.__version__ + " is outdated, please update.",
                  DeprecationWarning)

# Separate converter that holds this module's own Python-to-R rules.
korapclient_converter = robjects.conversion.Converter('base empty converter')
26
27
@korapclient_converter.py2rpy.register(list)
def _rpy2py_robject(listObject):
    """Turn a Python list into an R character vector.

    Registered as the Python-to-R rule for `list` on `korapclient_converter`.

    - **listObject** - Python list to convert

    Returns:
        `robjects.StrVector` built from **listObject**.
    """
    r_strings = robjects.StrVector(listObject)
    return r_strings
31
32
# Install the combination of rpy2's default converter, the pandas converter and
# this module's `korapclient_converter` as the conversion rpy2 uses.
robjects.conversion.set_conversion(robjects.default_converter + pandas2ri.converter + korapclient_converter)

# NOTE(review): this binds a second name to rpy2's default converter object, it
# does not copy it. Rules registered on `fix_lists_in_dataframes` are therefore
# registered on `robjects.default_converter` itself -- confirm this is intended.
fix_lists_in_dataframes = robjects.default_converter
36
37
@fix_lists_in_dataframes.rpy2py.register(StrSexpVector)
def to_str(obj):
    """Collapse an R character vector into one tab-separated string.

    Every element is first overwritten, in place, with its `str()` form; the
    elements are then joined with a tab character.

    - **obj** - R character vector (`StrSexpVector`); it is modified in place

    Returns:
        `str` containing all elements of **obj** separated by tabs.
    """
    for position, element in enumerate(obj):
        # Write the stringified value back so that the join below sees it.
        obj[position] = str(element)
    separator = "\t"
    return separator.join(obj)
43
def my_cv(obj, cv):
    """Convert one data frame column from R to Python.

    - **obj** - R object holding the column
    - **cv** - converter whose `rpy2py` is used for everything that is not a character vector

    Returns:
        For character vectors a `StrSexpVector` whose elements were replaced, in place,
        by their `str()` form; otherwise the result of `cv.rpy2py(obj)`.
    """
    # Anything but a character vector is left to the supplied converter.
    if not isinstance(obj, ri.StrSexpVector):
        return cv.rpy2py(obj)

    for position, element in enumerate(obj):
        obj[position] = str(element)
    return StrSexpVector(obj)
51
def toDataFrame(obj):
    """Build a pandas DataFrame from an R list vector, flattening nested lists.

    Elements of **obj** that are themselves list vectors are split up: each of their
    members becomes a column of its own, named `<outer name>.<inner name>`. Where the
    outer names are missing, `l<index>` is used instead; where the inner names are
    missing, the member index is used. All other elements become one column each.

    - **obj** - R list vector to convert

    Returns:
        `DataFrame` with one column per collected element, each converted by `my_cv`.
    """
    cv = get_conversion()  # converter that is active in the calling context
    columns = []  # (column name, R object) pairs in output order

    for i in range(len(obj)):
        element = obj[i]

        if not isinstance(element, ri.ListSexpVector):
            columns.append((obj.names[i], element))
            continue

        # Nested list: derive the prefix shared by all of its members.
        if isinstance(obj.names, NULLType):
            prefix = "l" + str(i) + "."
        else:
            prefix = obj.names[i] + "."

        for j in range(len(element)):
            if isinstance(element.names, NULLType):
                suffix = str(j)
            else:
                suffix = str(element.names[j])
            columns.append((prefix + suffix, element[j]))

    return pd.DataFrame({str(name): my_cv(value, cv) for name, value in columns})
71
# Register toDataFrame under the R class name "data.frame" in the converter's
# name-to-function map for list vectors, so that R data frames are converted
# by toDataFrame.
fix_lists_in_dataframes.rpy2py_nc_map[ri.ListSexpVector].update({"data.frame": toDataFrame})
74
75
Marc Kupietz2fd46fd2021-10-02 17:24:01 +020076
def expand_grid(dictionary):
    """Build a pandas DataFrame containing every combination of the given values.

    - **dictionary** - dict that maps variable names to the values each variable can take

    Returns:
        DataFrame with one column per dictionary key and one row per combination of the values.
        The last variable varies fastest.

    Example:
        ```
        $ df = expand_grid({"Year": range(2010, 2019), "Country": ["DE", "CH"] })

        $ df["vc"] = "textType=/Zeit.*/ & pubPlaceKey = " + df.Country + " & pubDate in " + list(map(str, df.Year))
        ```
    """
    column_names = dictionary.keys()
    combinations = list(product(*dictionary.values()))
    return pd.DataFrame(combinations, columns=column_names)
96
97
# noinspection PyPep8Naming
class KorAPConnection(RS4):
    """Connection to a KorAP server.

    Wraps the R connection object created by the R package; most methods forward their
    arguments to the R function of the same name.
    """

    def __init__(self, *args, **kwargs):
        """Create the connection.

        Constructor keyword arguments:

        - **KorAPUrl** (default = `"https://korap.ids-mannheim.de/"`)
        - **apiVersion** (default = 'v1.0')
        - **apiUrl**
        - **accessToken** (default = `getAccessToken(KorAPUrl)`)
        - **userAgent** (default = `"Python-KorAP-Client"`)
        - **timeout** (default = 110)
        - **verbose** (default = False)
        - **cache** (default = True)
        """
        # Announce the Python client unless the caller picked a user agent.
        kwargs.setdefault("userAgent", "Python-KorAP-Client")
        r_connection = KorAPClient.KorAPConnection(*args, **kwargs)
        super().__init__(r_connection)

    def corpusStats(self, *args, **kwargs):
        """Query the size of the whole corpus or of the virtual corpus given by the vc argument.

        - **vc** (default = "")
        - **verbose** (default = kco@verbose)
        - **as.df** (default = True)

        Returns:
            `DataFrame`|`RS4`

        Example:
            ```
            $ df = kcon.corpusStats("pubDate in 2018 & textType=/Zeit.*/ & pubPlaceKey=IT", **{"as.df": True})
            $ df["tokens"]
            12150897
            ```
        """
        # Explicitly passed keyword arguments win over the `as.df` default.
        effective_kwargs = {"as.df": True, **kwargs}
        return KorAPClient.corpusStats(self, *args, **effective_kwargs)

    def frequencyQuery(self, *args, **kwargs):
        """Query the relative frequency of search term(s).

        - **query** - query string or list of query strings
        - **vc** - virtual corpus definition or list thereof (default: "")
        - **conf.level** - confidence level of the returned confidence interval (default = 0.95)
        - **as.alternatives** - decides whether queries should be treated as mutually exclusive and exhaustive wrt. to some meaningful class (e.g. spelling variants of a certain word form) (default = False)
        - **KorAPUrl** - instead of specifying the `query` and `vc` string parameters, you can copy your KorAP query URL here from the browser
        - **metadataOnly** - determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. (default = True)
        - **ql** - query language: `"poliqarp" | "cosmas2" | "annis" | "cql" | "fcsql"` (default = `"poliqarp"`)
        - **accessRewriteFatal** - abort if query or given vc had to be rewritten due to insufficient rights (not yet implemented) (default = `True`)
        - **verbose** - (default = `self.verbose`)
        - **expand** - bool that decides if `query` and `vc` parameters are expanded to all of their combinations (default = `len(vc) != len(query)`)

        Returns:
            DataFrame with columns `'query', 'totalResults', 'vc', 'webUIRequestUrl', 'total', 'f',
            'conf.low', 'conf.high'`.

        Example:
            ```
            $ kcon = KorAPConnection(verbose=True)
            $ kcon.frequencyQuery("Ameisenplage", vc=["pubDate in "+str(y) for y in range(2010,2015)])
                      query  totalResults  ...      conf.low     conf.high
            1  Ameisenplage             3  ...  9.727696e-10  1.200289e-08
            2  Ameisenplage            12  ...  3.838218e-09  1.275717e-08
            3  Ameisenplage             5  ...  2.013352e-09  1.356500e-08
            4  Ameisenplage             6  ...  2.691331e-09  1.519888e-08
            5  Ameisenplage             3  ...  8.629463e-10  1.064780e-08
            ```
        """
        result = KorAPClient.frequencyQuery(self, *args, **kwargs)
        return result

    def collocationScoreQuery(self, node, collocate, vc="", **kwargs):
        """Compute collocation scores for the given node(s) and collocate(s).

        - **node** - target word
        - **collocate** - collocate of target word
        - **vc** - virtual corpus definition or list thereof (default: "")
        - **lemmatizeNodeQuery** - logical, set to TRUE if node query should be lemmatized, i.e. x -> [tt/l=x]
        - **lemmatizeCollocateQuery** - logical, set to TRUE if collocate query should be lemmatized, i.e. x -> [tt/l=x]
        - **leftContextSize** - size of the left context window
        - **rightContextSize** - size of the right context window
        - **scoreFunctions** - named list of R (!) score functions of the form function(O1, O2, O, N, E, window_size), see e.g. KorAPClient.pmi
        - **smoothingConstant** - smoothing constant will be added to all observed values

        Returns:
            DataFrame with columns `'node', 'collocate', 'label', 'vc','webUIRequestUrl', 'w', 'leftContextSize',
            'rightContextSize', 'N', 'O', 'O1', 'O2', 'E', 'pmi', 'mi2', 'mi3', 'logDice', 'll'`

        Example:
            ```
            $ kcon = KorAPConnection(verbose=True)
            $ df = kcon.collocationScoreQuery("Grund", "triftiger")
            ```
        """
        scores = KorAPClient.collocationScoreQuery(self, node, collocate, vc, **kwargs)
        return scores

    def collocationAnalysis(self, node, vc="", **kwargs):
        """ **EXPERIMENTAL**: Run a collocation analysis for the given node (or query) in the given virtual corpus.

        - **node** - target word or list of target words
        - **vc** - string or list of strings describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
        - **lemmatizeNodeQuery** - if True, node query will be lemmatized, i.e. x -> [tt/l=x]
        - **minOccur** - minimum absolute number of observed co-occurrences to consider a collocate candidate
        - **leftContextSize** - size of the left context window
        - **rightContextSize** - size of the right context window
        - **topCollocatesLimit** - limit analysis to the n most frequent collocates in the search hits sample
        - **searchHitsSampleLimit** - limit the size of the search hits sample
        - **ignoreCollocateCase** - bool, set to True if collocate case should be ignored
        - **withinSpan** - KorAP span specification (see <https://korap.ids-mannheim.de/doc/ql/poliqarp-plus?embedded=true#spans>) for collocations to be searched within. Defaults to `base/s=s`
        - **exactFrequencies** - if False, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies
        - **stopwords** - vector of stopwords not to be considered as collocates
        - **seed** - seed for random page collecting order
        - **expand** - if True, node and vc parameters are expanded to all of their combinations

        Returns:
            DataFrame with columns `'node', 'collocate', 'label', 'vc','webUIRequestUrl', 'w', 'leftContextSize',
            'rightContextSize', 'N', 'O', 'O1', 'O2', 'E', 'pmi', 'mi2', 'mi3', 'logDice', 'll'`

        Details:
            The collocation analysis is currently implemented on the client side, as some of the functionality is not yet provided by the KorAP backend. Mainly for this reason it is very slow (several minutes, up to hours), but on the other hand very flexible. You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries, and look for expression-internal collocates using the focus function (see examples and demo).
            To increase speed at the cost of accuracy and possible false negatives, you can decrease searchHitsSampleLimit and/or topCollocatesLimit and/or set exactFrequencies to FALSE.
            Note that currently not the tokenization provided by the backend, i.e. the corpus itself, is used, but a tinkered one. This can also lead to false negatives and to frequencies that differ from corresponding ones acquired via the web user interface.

        Example:
            ```
            $ kcon = KorAPConnection(verbose=True)
            $ df = kcon.collocationAnalysis("Grund")
            ```
        """
        analysis = KorAPClient.collocationAnalysis(self, node, vc, **kwargs)
        return analysis

    def mergeDuplicateCollocates(self, *args, **kwargs):
        """Merge collocation analysis results for different context positions.

        All arguments are passed on as given; the connection itself is not forwarded.
        """
        merged = KorAPClient.mergeDuplicateCollocates(*args, **kwargs)
        return merged

    def corpusQuery(self, *args, **kwargs):
        """Query search term(s).

        - **query** - query string or list of query strings
        - **vc** - virtual corpus definition or list thereof (default: "")
        - **KorAPUrl** - instead of specifying the `query` and `vc` string parameters, you can copy your KorAP query URL here from the browser
        - **metadataOnly** - determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. (default = True)
        - **ql** - query language: `"poliqarp" | "cosmas2" | "annis" | "cql" | "fcsql"` (default = `"poliqarp"`)
        - **fields** - (meta)data fields that will be fetched for every match (default = `["corpusSigle", "textSigle", "pubDate", "pubPlace", "availability", "textClass", "matchStart", "matchEnd"]`)
        - **verbose** - (default = `self.verbose`)

        Returns:
            `KorAPQuery`

        Example:
            ```
            $ kcon = KorAPConnection(verbose=True)
            $ q = kcon.corpusQuery("Ameisenplage")
            $ q = q.fetchAll()
            $ q.slots['collectedMatches']
                corpusSigle  ...                                          textClass
            1         WPD17  ...                                                NaN
            2         WPD17  ...                                                NaN
            3         WPD17  ...                                                NaN
            4         WPD17  ...                                                NaN
            5         WPD17  ...                                                NaN
            ..          ...  ...                                                ...
            126         Z83  ...                       freizeit-unterhaltung reisen
            127       MZE03  ...  freizeit-unterhaltung reisen natur-umwelt wett...
            128       MZE03  ...  freizeit-unterhaltung reisen staat-gesellschaf...
            129       MZE14  ...  wissenschaft populaerwissenschaft freizeit-unt...
            130       MZE00  ...                  wissenschaft populaerwissenschaft
            [130 rows x 6 columns]
            ```
        """
        # The query object issues the request, with this connection as its first argument.
        query = KorAPQuery(self, *args, **kwargs)
        return query

    def textMetadata(self, textSigle, **kwargs):
        """ Retrieve metadata for a text, identified by its sigle (id), using the corresponding KorAP API
        (see `Kustvakt Wiki https://github.com/KorAP/Kustvakt/wiki/Service:-Metadata-Retrieval`).

        - **textSigle** - unique text id (concatenation of corpus, document and text ids, separated by `/`, e.g. `WUD17/A97/08542`) or list thereof

        Returns:
            DataFrame with columns for each metadata property. In case of errors, such as non-existing texts/sigles, the DataFrame will also contain a column called `errors`.
            If there are metadata columns you cannot make sense of, please ignore them. The function simply returns all the metadata it gets from the server.

        Example:
            ```
            $ kcon = KorAPConnection(verbose=True)
            $ kcon.textMetadata(["WUD17/A97/08542", "WUD17/B96/57558", "WUD17/A97/08541"])
            ```
        """
        metadata = KorAPClient.textMetadata(self, textSigle, **kwargs)
        return metadata

Marc Kupietz6d539742020-06-25 18:33:20 +0200291
class KorAPQuery(RS4):
    """Query to a KorAP server."""

    def __init__(self, *args, **kwargs):
        """Run the R function `corpusQuery` and wrap the R object it returns.

        All positional and keyword arguments are forwarded unchanged; see
        `KorAPConnection.corpusQuery` for their meaning.
        """
        kco = KorAPClient.corpusQuery(*args, **kwargs)
        super().__init__(kco)

    @staticmethod
    def _with_converted_matches(res):
        """Re-read the `collectedMatches` slot of **res** under the list-aware converter.

        The slot is read while `fix_lists_in_dataframes` is the active converter and the
        converted value is stored back into the same slot.

        - **res** - query object returned by one of the R fetch functions

        Returns:
            **res** itself, with its `collectedMatches` slot replaced.
        """
        with localconverter(fix_lists_in_dataframes):
            df = res.slots['collectedMatches']
        # The write-back happens outside the local converter context, as the read value
        # is already a Python object.
        res.slots['collectedMatches'] = df
        return res

    def fetchNext(self, *args, **kwargs):
        """Fetch next couple of query results

        - **offset** - start offset for query results to fetch
        - **maxFetch** - maximum number of query results to fetch
        - **verbose**
        - **randomizePageOrder** - fetch result pages in pseudo random order if true. (default = `False`)

        Returns:
            `KorAPQuery`
        """
        res = KorAPClient.fetchNext(self, *args, **kwargs)
        return self._with_converted_matches(res)

    def fetchRest(self, *args, **kwargs):
        """Fetch remaining query results

        - **verbose**

        Returns:
            `KorAPQuery`
        """
        res = KorAPClient.fetchRest(self, *args, **kwargs)
        return self._with_converted_matches(res)

    def fetchAll(self, *args, **kwargs):
        """Fetch all query results

        - **verbose**

        Returns:
            `KorAPQuery`

        Example:
            See `KorAPConnection.corpusQuery`.
        """
        # Delegate to the R function of the same name, not to `fetchRest`.
        res = KorAPClient.fetchAll(self, *args, **kwargs)
        return self._with_converted_matches(res)
347