blob: d329bfb24fa57cdab8ec663550c38e396b8e0e45 [file] [log] [blame]
# pdoc configuration: a False entry keeps the named member ('tests') out of
# the generated API documentation.
__pdoc__ = { 'tests': False }
import rpy2.robjects.packages as packages
import rpy2.robjects.pandas2ri as pandas2ri
from rpy2.robjects.methods import RS4
from pandas import DataFrame

# Handle to the R package RKorAPClient, loaded once at import time; the
# classes in this module delegate to its functions.
KorAPClient = packages.importr('RKorAPClient')
# Switch on rpy2's pandas conversion for this process (module-level side
# effect), which the DataFrame results documented below depend on.
pandas2ri.activate()
Marc Kupietz6d539742020-06-25 18:33:20 +02009
Marc Kupietz7494fad2020-06-26 22:44:53 +020010# noinspection PyPep8Naming
class KorAPConnection(RS4):
    """Connection to a KorAP server."""

    def __init__(self, *args, **kwargs):
        """Open a connection; all arguments are handed to the R constructor `KorAPConnection`.

        Keyword arguments:

        - **KorAPUrl** (default = `"https://korap.ids-mannheim.de/"`)
        - **apiVersion** (default = `'v1.0'`)
        - **apiUrl**
        - **accessToken** (default = `getAccessToken(KorAPUrl)`)
        - **userAgent** (default = `"Python-KorAP-Client"`)
        - **timeout** (default = 110)
        - **verbose** (default = False)
        - **cache** (default = True)
        """
        # Only fill in the Python-specific user agent when the caller gave none.
        kwargs.setdefault("userAgent", "Python-KorAP-Client")
        r_connection = KorAPClient.KorAPConnection(*args, **kwargs)
        super().__init__(r_connection)

    def corpusStats(self, *args, **kwargs):
        """Query the size of the whole corpus or of the virtual corpus given by the `vc` argument.

        Arguments (passed on to the R function `corpusStats`):

        - vc = ""
        - verbose = kco@verbose
        - as.df = False

        Returns:
            `DataFrame`|`RS4`

        Example:
            ```
            $ df = kcon.corpusStats("pubDate in 2018 & textType=/Zeit.*/ & pubPlaceKey=IT", **{"as.df": True})
            $ df["tokens"]
            12150897
            ```
        """
        stats = KorAPClient.corpusStats(self, *args, **kwargs)
        return stats

    def frequencyQuery(self, *args, **kwargs):
        """Query the relative frequency of search term(s).

        Arguments (passed on to the R function `frequencyQuery`):

        - **query** – query string or list of query strings
        - **vc** – virtual corpus definition or list thereof (default: "")
        - **conf.level** – confidence level of the returned confidence interval (default = 0.95)
        - **as.alternatives** – decides whether queries should be treated as mutually exclusive and exhaustive with respect to some meaningful class (e.g. spelling variants of a certain word form) (default = False)
        - **KorAPUrl** – instead of specifying the `query` and `vc` string parameters, you can copy your KorAP query URL here from the browser
        - **metadataOnly** – determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. (default = True)
        - **ql** – query language: `"poliqarp" | "cosmas2" | "annis" | "cql" | "fcsql"` (default = `"poliqarp"`)
        - **accessRewriteFatal** – abort if query or given vc had to be rewritten due to insufficient rights (not yet implemented) (default = `True`)
        - **verbose** – (default = `self.verbose`)
        - **expand** – bool that decides if `query` and `vc` parameters are expanded to all of their combinations (default = `len(vc) != len(query)`)

        Returns:
            DataFrame with columns `'query', 'totalResults', 'vc', 'webUIRequestUrl', 'total', 'f',
            'conf.low', 'conf.high'`.

        Example:
            ```
            $ kcon = KorAPConnection(verbose=True)
            $ kcon.frequencyQuery("Ameisenplage", vc=["pubDate in "+str(y) for y in range(2010,2015)])
                      query  totalResults  ...      conf.low     conf.high
            1  Ameisenplage             3  ...  9.727696e-10  1.200289e-08
            2  Ameisenplage            12  ...  3.838218e-09  1.275717e-08
            3  Ameisenplage             5  ...  2.013352e-09  1.356500e-08
            4  Ameisenplage             6  ...  2.691331e-09  1.519888e-08
            5  Ameisenplage             3  ...  8.629463e-10  1.064780e-08
            ```
        """
        frequencies = KorAPClient.frequencyQuery(self, *args, **kwargs)
        return frequencies

    def corpusQuery(self, *args, **kwargs):
        """Query search term(s) over this connection.

        Arguments (this connection is prepended and everything is passed to `KorAPQuery`):

        - **query** – query string or list of query strings
        - **vc** – virtual corpus definition or list thereof (default: "")
        - **KorAPUrl** – instead of specifying the `query` and `vc` string parameters, you can copy your KorAP query URL here from the browser
        - **metadataOnly** – determines whether queries should return only metadata without any snippets. This can also be useful to prevent access rewrites. (default = True)
        - **ql** – query language: `"poliqarp" | "cosmas2" | "annis" | "cql" | "fcsql"` (default = `"poliqarp"`)
        - **fields** – (meta)data fields that will be fetched for every match (default = `["corpusSigle", "textSigle", "pubDate", "pubPlace", "availability", "textClass"]`)
        - **verbose** – (default = `self.verbose`)

        Returns:
            `KorAPQuery` | `pandas.DataFrame`

        Example:
            ```
            $ kcon = KorAPConnection(verbose=True)
            $ q = kcon.corpusQuery("Ameisenplage")
            $ q = q.fetchAll()
            $ q.slots['collectedMatches']
                corpusSigle  ...                                          textClass
            1         WPD17  ...                                                NaN
            2         WPD17  ...                                                NaN
            3         WPD17  ...                                                NaN
            4         WPD17  ...                                                NaN
            5         WPD17  ...                                                NaN
            ..          ...  ...                                                ...
            126         Z83  ...                       freizeit-unterhaltung reisen
            127       MZE03  ...  freizeit-unterhaltung reisen natur-umwelt wett...
            128       MZE03  ...  freizeit-unterhaltung reisen staat-gesellschaf...
            129       MZE14  ...  wissenschaft populaerwissenschaft freizeit-unt...
            130       MZE00  ...                  wissenschaft populaerwissenschaft
            [130 rows x 6 columns]
            ```
        """
        query = KorAPQuery(self, *args, **kwargs)
        return query
118
119
class KorAPQuery(RS4):
    """Query to a KorAP server."""

    def __init__(self, *args, **kwargs):
        """Run the R function `corpusQuery` and wrap the query object it returns.

        All positional and keyword arguments are passed through unchanged;
        `KorAPConnection.corpusQuery` supplies the connection as the first
        positional argument.
        """
        r_query = KorAPClient.corpusQuery(*args, **kwargs)
        super().__init__(r_query)

    def _adopt(self, r_result):
        """Make this wrapper refer to `r_result` and return `self`.

        The R fetch functions return a new query object, which rpy2 hands
        back as a generic `RS4` wrapper rather than as this class. Returning
        that wrapper directly would lose `fetchNext`/`fetchRest`/`fetchAll`,
        so chained calls would fail. Anything that is not an `RS4` object is
        returned unchanged.
        """
        if not isinstance(r_result, RS4):
            return r_result
        # Same re-wrapping call that __init__ uses for the initial query.
        super().__init__(r_result)
        return self

    def fetchNext(self, *args, **kwargs):
        """Fetch next couple of query results

        - **offset** – start offset for query results to fetch
        - **maxFetch** – maximum number of query results to fetch
        - **verbose**

        Returns:
            `KorAPQuery` – this object, updated in place to the query state
            returned by the R function `fetchNext`.
        """
        return self._adopt(KorAPClient.fetchNext(self, *args, **kwargs))

    def fetchRest(self, *args, **kwargs):
        """Fetch remaining query results

        - **verbose**

        Returns:
            `KorAPQuery` – this object, updated in place to the query state
            returned by the R function `fetchRest`.
        """
        return self._adopt(KorAPClient.fetchRest(self, *args, **kwargs))

    def fetchAll(self, *args, **kwargs):
        """Fetch all query results

        - **verbose**

        Returns:
            `KorAPQuery` – this object, updated in place to the query state
            returned by the R function `fetchAll`.

        Example:
            See `KorAPConnection.corpusQuery`.
        """
        return self._adopt(KorAPClient.fetchAll(self, *args, **kwargs))