# Documentation-generator override table: maps the name 'tests' to False.
# NOTE(review): presumably this hides the `tests` submodule from the pdoc
# output - confirm against the pdoc documentation.
__pdoc__ = {'tests': False}
| 2 | |
| Marc Kupietz | 27a1a2a | 2021-03-07 22:04:03 +0100 | [diff] [blame] | 3 | import warnings |
| Marc Kupietz | 49ef012 | 2022-09-29 17:57:01 +0200 | [diff] [blame] | 4 | from itertools import product |
| Marc Kupietz | 27a1a2a | 2021-03-07 22:04:03 +0100 | [diff] [blame] | 5 | |
| Marc Kupietz | 49ef012 | 2022-09-29 17:57:01 +0200 | [diff] [blame] | 6 | import pandas as pd |
| Marc Kupietz | a25adf0 | 2024-01-26 18:33:43 +0100 | [diff] [blame] | 7 | from rpy2.rinterface_lib.sexp import StrSexpVector, NULLType |
| 8 | from rpy2.robjects import numpy2ri |
| 9 | from rpy2.robjects.conversion import localconverter, get_conversion |
| 10 | |
| Marc Kupietz | c18d598 | 2021-10-02 16:38:30 +0200 | [diff] [blame] | 11 | import rpy2.robjects as robjects |
| 12 | import rpy2.robjects.packages as packages |
| 13 | import rpy2.robjects.pandas2ri as pandas2ri |
| Marc Kupietz | a25adf0 | 2024-01-26 18:33:43 +0100 | [diff] [blame] | 14 | from rpy2 import rinterface as ri |
| Marc Kupietz | c18d598 | 2021-10-02 16:38:30 +0200 | [diff] [blame] | 15 | from packaging import version |
| Marc Kupietz | c18d598 | 2021-10-02 16:38:30 +0200 | [diff] [blame] | 16 | from rpy2.robjects.methods import RS4 |
| 17 | |
# Version of the R package RKorAPClient that this wrapper is written against;
# an installed version older than this triggers the warning below.
CURRENT_R_PACKAGE_VERSION = "0.9.0"

# Handle to the R package; all wrapper classes and methods delegate to it.
KorAPClient = packages.importr('RKorAPClient')

# Warn (without failing) when the installed R package is older than expected.
if version.parse(CURRENT_R_PACKAGE_VERSION) > version.parse(KorAPClient.__version__):
    warnings.warn(
        "".join((
            "R-package RKorAPClient version ",
            KorAPClient.__version__,
            " is outdated, please update.",
        )),
        DeprecationWarning,
    )
| Marc Kupietz | 27a1a2a | 2021-03-07 22:04:03 +0100 | [diff] [blame] | 24 | |
# Converter that carries this package's own Python -> R conversion rule.
korapclient_converter = robjects.conversion.Converter('base empty converter')


@korapclient_converter.py2rpy.register(list)
def _rpy2py_robject(listObject):
    """Convert a Python list into an R character vector (`robjects.StrVector`).

    Registered as the Python-to-R (`py2rpy`) rule for `list`. Note that,
    despite its name, this function is not an R-to-Python (`rpy2py`) rule.

    - **listObject** - the Python list to convert

    Returns:
        `robjects.StrVector` built from `listObject`.
    """
    return robjects.StrVector(listObject)


# Install the combination of rpy2's default converter, the pandas converter
# and the list rule above through set_conversion().
robjects.conversion.set_conversion(robjects.default_converter + pandas2ri.converter + korapclient_converter)
| 34 | |
# Converter used when data frames containing list columns are read back.
# NOTE(review): this binds the very same object as `robjects.default_converter`
# (no copy is made), so every rule registered on `fix_lists_in_dataframes`
# is registered on that shared converter object - confirm this is intended.
fix_lists_in_dataframes = robjects.default_converter


@fix_lists_in_dataframes.rpy2py.register(StrSexpVector)
def to_str(obj):
    """Collapse an R character vector into a single tab-separated string.

    - **obj** - the `StrSexpVector` to convert

    Each element is passed through `str()` before joining (presumably so
    that non-`str` elements such as R's NA do not break the join). The
    input vector itself is not modified.

    Returns:
        `str` containing all elements of `obj`, separated by tab characters.
    """
    # Join over converted copies instead of overwriting the elements of the
    # caller's R vector in place.
    return "\t".join(str(element) for element in obj)
| 43 | |
def my_cv(obj, cv):
    """Convert one R object (a data-frame column) for use in a pandas DataFrame.

    - **obj** - the R object to convert
    - **cv** - converter whose `rpy2py` handles everything that is not a
      character vector

    Returns:
        For a `StrSexpVector`: a new `StrSexpVector` whose elements have all
        been passed through `str()`. For any other object: `cv.rpy2py(obj)`.
    """
    if not isinstance(obj, ri.StrSexpVector):
        return cv.rpy2py(obj)
    # Build a fresh vector from the stringified elements rather than
    # overwriting the elements of `obj`, so the caller's R object (a column of
    # the source data frame) is left untouched.
    return StrSexpVector([str(element) for element in obj])
| 51 | |
def toDataFrame(obj):
    """Build a pandas DataFrame from an R list-like object, flattening list columns.

    - **obj** - R object whose elements are the columns

    Columns that are themselves R lists (`ListSexpVector`) are split into one
    output column per list element. Such a column is labelled
    `<column name>.<element name>`; when the outer names are missing the
    prefix is `l<column index>.`, and when the inner names are missing the
    element index is used as suffix. All other columns keep their name.
    Every resulting column is converted with `my_cv` using the converter
    that is active when this function is called.

    Returns:
        `pandas.DataFrame`
    """
    active_converter = get_conversion()  # converter of the current context
    column_labels = []
    column_values = []
    for position in range(len(obj)):
        column = obj[position]
        if not isinstance(column, ri.ListSexpVector):
            # Ordinary column: taken over unchanged under its own name.
            column_labels.append(obj.names[position])
            column_values.append(column)
            continue

        # List column: spread its elements over separate, prefixed columns.
        if isinstance(obj.names, NULLType):
            prefix = "l" + str(position) + "."
        else:
            prefix = obj.names[position] + "."
        for sub_position in range(len(column)):
            if isinstance(column.names, NULLType):
                suffix = str(sub_position)
            else:
                suffix = str(column.names[sub_position])
            column_labels.append(prefix + suffix)
            column_values.append(column[sub_position])

    return pd.DataFrame(
        {
            str(label): my_cv(value, active_converter)
            for label, value in zip(column_labels, column_values)
        }
    )


# Use toDataFrame for R list vectors that carry the class "data.frame".
fix_lists_in_dataframes.rpy2py_nc_map[ri.ListSexpVector].update({"data.frame": toDataFrame})
| 74 | |
| 75 | |
| Marc Kupietz | 2fd46fd | 2021-10-02 17:24:01 +0200 | [diff] [blame] | 76 | |
def expand_grid(dictionary):
    """Build a pandas DataFrame holding every combination of the given values.

    - **dictionary** - dict mapping each variable name to the iterable of values it can take

    Returns:
        DataFrame with one column per dictionary key (in key order) and one row for each element of the
        Cartesian product of the value iterables.

    Example:
        ```
        $ df = expand_grid({"Year": range(2010, 2019), "Country": ["DE", "CH"] })

        $ df["vc"] = "textType=/Zeit.*/ & pubPlaceKey = " + df.Country + " & pubDate in " + list(map(str, df.Year))
        ```
    """
    column_names = list(dictionary)
    combinations = list(product(*dictionary.values()))
    return pd.DataFrame(combinations, columns=column_names)
| 96 | |
| 97 | |
# noinspection PyPep8Naming
class KorAPConnection(RS4):
    """Connection to a KorAP server."""

    def __init__(self, *args, **kwargs):
        """Constructor keyword arguments:

        - **KorAPUrl** (default = `"https://korap.ids-mannheim.de/"`)
        - **apiVersion** (default = 'v1.0')
        - **apiUrl**
        - **accessToken** (default = `getAccessToken(KorAPUrl)`)
        - **userAgent** (default = `"Python-KorAP-Client"`)
        - **timeout** (default = 110)
        - **verbose** (default = False)
        - **cache** (default = True)
        """
        # Announce the Python client unless the caller picked a user agent.
        kwargs.setdefault("userAgent", "Python-KorAP-Client")
        super().__init__(KorAPClient.KorAPConnection(*args, **kwargs))

    def corpusStats(self, *args, **kwargs):
        """Query the size of the whole corpus or of the virtual corpus given by the vc argument.

        - **vc** (default = "")
        - **verbose** (default = kco@verbose)
        - **as.df** (default = True)

        Returns:
            `DataFrame`|`RS4`

        Example:
            ```
            $ df = kcon.corpusStats("pubDate in 2018 & textType=/Zeit.*/ & pubPlaceKey=IT", **{"as.df": True})
            $ df["tokens"]
            12150897
            ```
        """
        # "as.df" defaults to True here; an explicit caller value wins.
        effective_kwargs = {"as.df": True, **kwargs}
        return KorAPClient.corpusStats(self, *args, **effective_kwargs)

    def frequencyQuery(self, *args, **kwargs):
        """Query the relative frequency of search term(s).

        - **query** - query string or list of query strings
        - **vc** - virtual corpus definition or list thereof (default: "")
        - **conf.level** - confidence level of the returned confidence interval (default = 0.95)
        - **as.alternatives** - decides whether queries should be treated as mutually exclusive and exhaustive with respect to some meaningful class (e.g. spelling variants of a certain word form) (default = False)
        - **KorAPUrl** - instead of specifying the `query` and `vc` string parameters, you can copy your KorAP query URL here from the browser
        - **metadataOnly** - determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. (default = True)
        - **ql** - query language: `"poliqarp" | "cosmas2" | "annis" | "cql" | "fcsql"` (default = `"poliqarp"`)
        - **accessRewriteFatal** - abort if query or given vc had to be rewritten due to insufficient rights (not yet implemented) (default = `True`)
        - **verbose** - (default = `self.verbose`)
        - **expand** - bool that decides if `query` and `vc` parameters are expanded to all of their combinations (default = `len(vc) != len(query)`)

        Returns:
            DataFrame with columns `'query', 'totalResults', 'vc', 'webUIRequestUrl', 'total', 'f',
            'conf.low', 'conf.high'`.

        Example:
            ```
            $ kcon = KorAPConnection(verbose=True)
            $ kcon.frequencyQuery("Ameisenplage", vc=["pubDate in "+str(y) for y in range(2010,2015)])
                      query  totalResults  ...      conf.low     conf.high
            1  Ameisenplage             3  ...  9.727696e-10  1.200289e-08
            2  Ameisenplage            12  ...  3.838218e-09  1.275717e-08
            3  Ameisenplage             5  ...  2.013352e-09  1.356500e-08
            4  Ameisenplage             6  ...  2.691331e-09  1.519888e-08
            5  Ameisenplage             3  ...  8.629463e-10  1.064780e-08
            ```
        """
        return KorAPClient.frequencyQuery(self, *args, **kwargs)

    def collocationScoreQuery(self, node, collocate, vc="", **kwargs):
        """Get collocation scores for the given node(s) and collocate(s).

        - **node** - target word
        - **collocate** - collocate of target word
        - **vc** - virtual corpus definition or list thereof (default: "")
        - **lemmatizeNodeQuery** - logical, set to TRUE if node query should be lemmatized, i.e. x -> [tt/l=x]
        - **lemmatizeCollocateQuery** - logical, set to TRUE if collocate query should be lemmatized, i.e. x -> [tt/l=x]
        - **leftContextSize** - size of the left context window
        - **rightContextSize** - size of the right context window
        - **scoreFunctions** - named list of R (!) score functions of the form function(O1, O2, O, N, E, window_size), see e.g. KorAPClient.pmi
        - **smoothingConstant** - smoothing constant that will be added to all observed values

        Returns:
            DataFrame with columns `'node', 'collocate', 'label', 'vc','webUIRequestUrl', 'w', 'leftContextSize',
            'rightContextSize', 'N', 'O', 'O1', 'O2', 'E', 'pmi', 'mi2', 'mi3', 'logDice', 'll'`

        Example:
            ```
            $ kcon = KorAPConnection(verbose=True)
            $ df = kcon.collocationScoreQuery("Grund", "triftiger")
            ```
        """
        return KorAPClient.collocationScoreQuery(self, node, collocate, vc, **kwargs)

    def collocationAnalysis(self, node, vc="", **kwargs):
        """ **EXPERIMENTAL**: Perform a collocation analysis for the given node (or query) in the given virtual corpus.

        - **node** - target word or list of target words
        - **vc** - string or list of strings describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
        - **lemmatizeNodeQuery** - if True, node query will be lemmatized, i.e. x -> [tt/l=x]
        - **minOccur** - minimum absolute number of observed co-occurrences to consider a collocate candidate
        - **leftContextSize** - size of the left context window
        - **rightContextSize** - size of the right context window
        - **topCollocatesLimit** - limit analysis to the n most frequent collocates in the search hits sample
        - **searchHitsSampleLimit** - limit the size of the search hits sample
        - **ignoreCollocateCase** - bool, set to True if collocate case should be ignored
        - **withinSpan** - KorAP span specification (see <https://korap.ids-mannheim.de/doc/ql/poliqarp-plus?embedded=true#spans>) for collocations to be searched within. Defaults to `base/s=s`
        - **exactFrequencies** - if False, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies
        - **stopwords** - vector of stopwords not to be considered as collocates
        - **seed** - seed for random page collecting order
        - **expand** - if True, node and vc parameters are expanded to all of their combinations

        Returns:
            DataFrame with columns `'node', 'collocate', 'label', 'vc','webUIRequestUrl', 'w', 'leftContextSize',
            'rightContextSize', 'N', 'O', 'O1', 'O2', 'E', 'pmi', 'mi2', 'mi3', 'logDice', 'll'`

        Details:
            The collocation analysis is currently implemented on the client side, as some of the functionality is not yet provided by the KorAP backend. Mainly for this reason it is very slow (several minutes, up to hours), but on the other hand very flexible. You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries, and look for expression-internal collocates using the focus function (see examples and demo).
            To increase speed at the cost of accuracy and possible false negatives, you can decrease searchHitsSampleLimit and/or topCollocatesLimit and/or set exactFrequencies to FALSE.
            Note that currently not the tokenization provided by the backend, i.e. the corpus itself, is used, but a tinkered one. This can also lead to false negatives and to frequencies that differ from corresponding ones acquired via the web user interface.

        Example:
            ```
            $ kcon = KorAPConnection(verbose=True)
            $ df = kcon.collocationAnalysis("Grund")
            ```
        """
        return KorAPClient.collocationAnalysis(self, node, vc, **kwargs)

    def mergeDuplicateCollocates(self, *args, **kwargs):
        """Merge collocation analysis results for different context positions.

        All arguments are handed unchanged to the R function of the same name;
        the connection object itself is not passed on.
        """
        return KorAPClient.mergeDuplicateCollocates(*args, **kwargs)

    def corpusQuery(self, *args, **kwargs):
        """Query search term(s).

        - **query** - query string or list of query strings
        - **vc** - virtual corpus definition or list thereof (default: "")
        - **KorAPUrl** - instead of specifying the `query` and `vc` string parameters, you can copy your KorAP query URL here from the browser
        - **metadataOnly** - determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. (default = True)
        - **ql** - query language: `"poliqarp" | "cosmas2" | "annis" | "cql" | "fcsql"` (default = `"poliqarp"`)
        - **fields** - (meta)data fields that will be fetched for every match (default = `["corpusSigle", "textSigle", "pubDate", "pubPlace", "availability", "textClass", "matchStart", "matchEnd"]`)
        - **verbose** - (default = `self.verbose`)

        Returns:
            `KorAPQuery`

        Example:
            ```
            $ kcon = KorAPConnection(verbose=True)
            $ q = kcon.corpusQuery("Ameisenplage")
            $ q = q.fetchAll()
            $ q.slots['collectedMatches']
                corpusSigle  ...                                          textClass
            1         WPD17  ...                                                NaN
            2         WPD17  ...                                                NaN
            3         WPD17  ...                                                NaN
            4         WPD17  ...                                                NaN
            5         WPD17  ...                                                NaN
            ..          ...  ...                                                ...
            126         Z83  ...                       freizeit-unterhaltung reisen
            127       MZE03  ...  freizeit-unterhaltung reisen natur-umwelt wett...
            128       MZE03  ...  freizeit-unterhaltung reisen staat-gesellschaf...
            129       MZE14  ...  wissenschaft populaerwissenschaft freizeit-unt...
            130       MZE00  ...                  wissenschaft populaerwissenschaft
            [130 rows x 6 columns]
            ```
        """
        return KorAPQuery(self, *args, **kwargs)

    def textMetadata(self, textSigle, **kwargs):
        """ Retrieve metadata for a text, identified by its sigle (id), using the corresponding KorAP API
        (see `Kustvakt Wiki https://github.com/KorAP/Kustvakt/wiki/Service:-Metadata-Retrieval`).

        - **textSigle** - unique text id (concatenation of corpus, document and text ids, separated by `/`, e.g. `WUD17/A97/08542`) or list thereof

        Returns:
            DataFrame with columns for each metadata property. In case of errors, such as non-existing texts/sigles, the DataFrame will also contain a column called `errors`.
            If there are metadata columns you cannot make sense of, please ignore them. The function simply returns all the metadata it gets from the server.

        Example:
            ```
            $ kcon = KorAPConnection(verbose=True)
            $ kcon.textMetadata(["WUD17/A97/08542", "WUD17/B96/57558", "WUD17/A97/08541"])
            ```
        """
        return KorAPClient.textMetadata(self, textSigle, **kwargs)
| Marc Kupietz | 6d53974 | 2020-06-25 18:33:20 +0200 | [diff] [blame] | 291 | |
def _reconvert_collected_matches(result):
    """Re-read the `collectedMatches` slot of `result` with `fix_lists_in_dataframes` and store it back.

    The slot is read while `fix_lists_in_dataframes` is the local converter
    and is assigned again after that context has been left.

    Returns:
        the same `result` object
    """
    with localconverter(fix_lists_in_dataframes):
        matches = result.slots['collectedMatches']
    result.slots['collectedMatches'] = matches
    return result


class KorAPQuery(RS4):
    """Query to a KorAP server."""

    def __init__(self, *args, **kwargs):
        """Run `KorAPClient.corpusQuery` with the given arguments and wrap its result."""
        super().__init__(KorAPClient.corpusQuery(*args, **kwargs))

    def fetchNext(self, *args, **kwargs):
        """Fetch the next batch of query results

        - **offset** - start offset for query results to fetch
        - **maxFetch** - maximum number of query results to fetch
        - **verbose**
        - **randomizePageOrder** - fetch result pages in pseudo random order if true. (default = `False`)

        Returns:
            `KorAPQuery`
        """
        return _reconvert_collected_matches(KorAPClient.fetchNext(self, *args, **kwargs))

    def fetchRest(self, *args, **kwargs):
        """Fetch the remaining query results

        - **verbose**

        Returns:
            `KorAPQuery`
        """
        return _reconvert_collected_matches(KorAPClient.fetchRest(self, *args, **kwargs))

    def fetchAll(self, *args, **kwargs):
        """Fetch all query results

        - **verbose**

        Returns:
            `KorAPQuery`

        Example:
            See `KorAPConnection.corpusQuery`.
        """
        # NOTE(review): this delegates to the R function `fetchRest`, exactly
        # like `fetchRest` above - confirm that this is intended.
        return _reconvert_collected_matches(KorAPClient.fetchRest(self, *args, **kwargs))
| 347 | |