# pdoc configuration: mapping a name to False asks pdoc to leave it out of the
# generated documentation (here: the `tests` submodule).
__pdoc__ = { 'tests': False }
import rpy2.robjects.packages as packages
import rpy2.robjects.pandas2ri as pandas2ri
from rpy2.robjects.methods import RS4
from pandas import DataFrame

# Load the R package RKorAPClient through rpy2. The wrapper classes in this
# module delegate all of their work to functions of this package object.
KorAPClient = packages.importr('RKorAPClient')
# Switch on rpy2's pandas conversion layer at import time so that values
# crossing the R/Python boundary are converted between R and pandas objects.
pandas2ri.activate()
| Marc Kupietz | 6d53974 | 2020-06-25 18:33:20 +0200 | [diff] [blame] | 9 | |
# noinspection PyPep8Naming
class KorAPConnection(RS4):
    """Connection to a KorAP server.

    Python-side wrapper around the connection object created by the R package
    RKorAPClient. Every method forwards its arguments unchanged to the R
    function of the same name, passing this connection as the first argument.
    """

    def __init__(self, *args, **kwargs):
        """Create the underlying R connection object and wrap it.

        All positional and keyword arguments are handed on to
        `KorAPClient.KorAPConnection`. Constructor keyword arguments:

        - **KorAPUrl** (default = `"https://korap.ids-mannheim.de/"`)
        - **apiVersion** (default = `'v1.0'`)
        - **apiUrl**
        - **accessToken** (default = `getAccessToken(KorAPUrl)`)
        - **userAgent** (default = `"Python-KorAP-Client"`)
        - **timeout** (default = 110)
        - **verbose** (default = False)
        - **cache** (default = True)
        """
        # Identify requests as coming from the Python client unless the caller
        # explicitly picked a user agent.
        kwargs.setdefault("userAgent", "Python-KorAP-Client")
        super().__init__(KorAPClient.KorAPConnection(*args, **kwargs))

    def corpusStats(self, *args, **kwargs):
        """Query the size of the whole corpus or of a virtual corpus.

        Arguments are forwarded to the R function `corpusStats`:

        - **vc** – virtual corpus definition (default = `""`, i.e. the whole corpus)
        - **verbose** – (default = the connection's `verbose` setting)
        - **as.df** – (default = False)

        Returns:
            `DataFrame`|`RS4`

        Example:
            ```
            $ df = kcon.corpusStats("pubDate in 2018 & textType=/Zeit.*/ & pubPlaceKey=IT", **{"as.df": True})
            $ df["tokens"]
            12150897
            ```
        """
        stats = KorAPClient.corpusStats(self, *args, **kwargs)
        return stats

    def frequencyQuery(self, *args, **kwargs):
        """Query the relative frequency of one or more search terms.

        Arguments are forwarded to the R function `frequencyQuery`:

        - **query** – query string or list of query strings
        - **vc** – virtual corpus definition or list thereof (default: `""`)
        - **conf.level** – confidence level of the returned confidence interval (default = 0.95)
        - **as.alternatives** – decides whether queries should be treated as mutually exclusive and exhaustive with respect to some meaningful class (e.g. spelling variants of a certain word form) (default = False)
        - **KorAPUrl** – instead of specifying the `query` and `vc` string parameters, you can copy your KorAP query URL here from the browser
        - **metadataOnly** – determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. (default = True)
        - **ql** – query language: `"poliqarp" | "cosmas2" | "annis" | "cql" | "fcsql"` (default = `"poliqarp"`)
        - **accessRewriteFatal** – abort if query or given vc had to be rewritten due to insufficient rights (not yet implemented) (default = `True`)
        - **verbose** – (default = `self.verbose`)
        - **expand** – bool that decides if `query` and `vc` parameters are expanded to all of their combinations (default = `len(vc) != len(query)`)

        Returns:
            DataFrame with columns `'query', 'totalResults', 'vc', 'webUIRequestUrl', 'total', 'f',
            'conf.low', 'conf.high'`.

        Example:
            ```
            $ kcon = KorAPConnection(verbose=True)
            $ kcon.frequencyQuery("Ameisenplage", vc=["pubDate in "+str(y) for y in range(2010,2015)])
                      query  totalResults  ...      conf.low     conf.high
            1  Ameisenplage             3  ...  9.727696e-10  1.200289e-08
            2  Ameisenplage            12  ...  3.838218e-09  1.275717e-08
            3  Ameisenplage             5  ...  2.013352e-09  1.356500e-08
            4  Ameisenplage             6  ...  2.691331e-09  1.519888e-08
            5  Ameisenplage             3  ...  8.629463e-10  1.064780e-08
            ```
        """
        frequencies = KorAPClient.frequencyQuery(self, *args, **kwargs)
        return frequencies

    def corpusQuery(self, *args, **kwargs):
        """Run a query for one or more search terms on this connection.

        Builds a `KorAPQuery` with this connection as its first argument; the
        remaining arguments are forwarded to the R function `corpusQuery`:

        - **query** – query string or list of query strings
        - **vc** – virtual corpus definition or list thereof (default: `""`)
        - **KorAPUrl** – instead of specifying the `query` and `vc` string parameters, you can copy your KorAP query URL here from the browser
        - **metadataOnly** – determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. (default = True)
        - **ql** – query language: `"poliqarp" | "cosmas2" | "annis" | "cql" | "fcsql"` (default = `"poliqarp"`)
        - **fields** – (meta)data fields that will be fetched for every match (default = `["corpusSigle", "textSigle", "pubDate", "pubPlace", "availability", "textClass"]`)
        - **verbose** – (default = `self.verbose`)

        Returns:
            `KorAPQuery`

        Example:
            ```
            $ kcon = KorAPConnection(verbose=True)
            $ q = kcon.corpusQuery("Ameisenplage")
            $ q = q.fetchAll()
            $ q.slots['collectedMatches']
                corpusSigle  ...                                          textClass
            1         WPD17  ...                                                NaN
            2         WPD17  ...                                                NaN
            3         WPD17  ...                                                NaN
            4         WPD17  ...                                                NaN
            5         WPD17  ...                                                NaN
            ..          ...  ...                                                ...
            126         Z83  ...                       freizeit-unterhaltung reisen
            127       MZE03  ...  freizeit-unterhaltung reisen natur-umwelt wett...
            128       MZE03  ...  freizeit-unterhaltung reisen staat-gesellschaf...
            129       MZE14  ...  wissenschaft populaerwissenschaft freizeit-unt...
            130       MZE00  ...                  wissenschaft populaerwissenschaft
            [130 rows x 6 columns]
            ```
        """
        query = KorAPQuery(self, *args, **kwargs)
        return query
| 118 | |
| 119 | |
class KorAPQuery(RS4):
    """Query to a KorAP server.

    Python-side wrapper around the query object produced by the R function
    `corpusQuery` of RKorAPClient. The fetch methods forward to the R
    functions of the same name, passing this query as the first argument.
    """

    def __init__(self, *args, **kwargs):
        """Run `KorAPClient.corpusQuery` with the given arguments and wrap the result.

        All arguments are forwarded unchanged. `KorAPConnection.corpusQuery`
        calls this constructor with the connection as the first positional
        argument, followed by the query arguments documented there.
        """
        super().__init__(KorAPClient.corpusQuery(*args, **kwargs))

    def fetchNext(self, *args, **kwargs):
        """Fetch the next batch of query results.

        - **offset** – start offset for query results to fetch
        - **maxFetch** – maximum number of query results to fetch
        - **verbose**

        Returns:
            `KorAPQuery`
        """
        fetched = KorAPClient.fetchNext(self, *args, **kwargs)
        return fetched

    def fetchRest(self, *args, **kwargs):
        """Fetch the remaining query results.

        - **verbose**

        Returns:
            `KorAPQuery`
        """
        fetched = KorAPClient.fetchRest(self, *args, **kwargs)
        return fetched

    def fetchAll(self, *args, **kwargs):
        """Fetch all query results.

        - **verbose**

        Returns:
            `KorAPQuery`

        Example:
            See `KorAPConnection.corpusQuery`.
        """
        fetched = KorAPClient.fetchAll(self, *args, **kwargs)
        return fetched