blob: 3957b9d361419c2da0391256fde82916dab368cd [file] [log] [blame]
feldmueller934e9af2025-04-14 17:36:32 +02001{
2 "cells": [
3 {
4 "cell_type": "code",
5 "execution_count": 1,
6 "metadata": {},
7 "outputs": [
8 {
9 "name": "stderr",
10 "output_type": "stream",
11 "text": [
12 "R[write to console]: Welcome to KorAP API for DeLiKo@DNB!\n",
13 "\n"
14 ]
15 },
16 {
17 "name": "stdout",
18 "output_type": "stream",
19 "text": [
20 "Searching \"[tt/l=Korpus]\" in \"\"\u001b[0m by this KorAP instance.\u001b[0m\u001b[32m\u001b[32m: 771 hits\u001b[0m\u001b[32m, took 0.11682251 s\n",
21 "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 1/16 in 0.31968938 s\n",
22 "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 2/16 in 0.384494092 s\n",
23 "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 3/16 in 0.229332277 s\n",
24 "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 3/16 in 0.229332277 s\n",
25 "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 4/16 in 0.229332277 s\n",
26 "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 5/16 in 0.397982977 s\n",
27 "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 6/16 in 0.467999036 s\n",
28 "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 7/16 in 1.06042362 s\n",
29 "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 8/16 in 0.504681485 s\n",
30 "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 9/16 in 0.436383899 s\n",
31 "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 10/16 in 0.621623505 s\n",
32 "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 11/16 in 0.258548927 s\n",
33 "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 12/16 in 0.433153669 s\n",
34 "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 13/16 in 0.432334822 s\n",
35 "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 14/16 in 0.399952292 s\n",
36 "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 15/16 in 0.400414105 s\n",
37 "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 16/16 in 0.731605508 s\n",
38 "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 17/16 in 0.173677668 s\n",
39 "\u001b[0m"
40 ]
41 }
42 ],
43 "source": [
44 "from KorAPClient import KorAPConnection\n",
45 "from rpy2.robjects import r\n",
46 "\n",
47 "# As base, use the fiction corpus DeLiKo@DNB (see <https://doi.org/10.5281/zenodo.14943116>)\n",
48 "kcon = KorAPConnection(KorAPUrl=\"https://korap.dnb.de/\", verbose=True).auth()\n",
49 "\n",
50 "r['set.seed'](42) # Set the seed for reproducibility, will in future be exported by KorAPClient\n",
51 "q = kcon.corpusQuery(\"[tt/l=Korpus]\", metadataOnly=False)\n",
52 "q = q.fetchNext(maxFetch=1000, randomizePageOrder=True)"
53 ]
54 },
55 {
56 "cell_type": "code",
57 "execution_count": null,
58 "metadata": {},
59 "outputs": [
60 {
61 "name": "stderr",
62 "output_type": "stream",
63 "text": [
64 "/var/folders/76/__9t5rnd5k94skg1118jhpw00000gn/T/ipykernel_77388/1758551592.py:36: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
65 " matches = tokens[tokens['offset'] == 0].groupby('line_id').apply(\n"
66 ]
67 }
68 ],
69 "source": [
70 "results = q.slots['collectedMatches']\n",
71 "# use index as line_id\n",
72 "results[\"line_id\"] = results.index\n",
73 "\n",
74 "# take all columns except snippet, tokens.left, tokens.match, tokens.right, matchStart, matchEnd\n",
75 "drop_columns = ['snippet', 'tokens.left', 'tokens.match', 'tokens.right', 'matchStart', 'matchEnd']\n",
76 "metadata = results.drop(columns=drop_columns)\n",
77 "\n",
78 "tokens = []\n",
79 "for _, line in results.iterrows():\n",
80 " left_context, match, right_context = (line[col].split(\"\\t\") for col in ['tokens.left', 'tokens.match', 'tokens.right'])\n",
81 " id_in_line = -1 # will be obsolete in later version of flexiconc\n",
82 " \n",
83 " if left_context != [\"\"]:\n",
84 " for i, token in enumerate(left_context):\n",
85 " id_in_line += 1\n",
86 " offset = 0 - len(left_context) + i\n",
87 " tokens.append([offset, token, line[\"line_id\"], id_in_line])\n",
88 " \n",
89 " for i, token in enumerate(match):\n",
90 " id_in_line += 1\n",
91 " offset = 0\n",
92 " tokens.append([offset, token, line[\"line_id\"], id_in_line])\n",
93 " \n",
94 " if right_context != [\"\"]:\n",
95 " for i, token in enumerate(right_context):\n",
96 " id_in_line += 1\n",
97 " offset = i + 1\n",
98 " tokens.append([offset, token, line[\"line_id\"], id_in_line])\n",
99 "\n",
100 "# create a dataframe from the tokens list\n",
101 "import pandas as pd\n",
102 "tokens = pd.DataFrame(tokens, columns=[\"offset\", \"word\", \"line_id\", \"id_in_line\"])\n",
103 "\n",
104 "# matches df will be obsolete in later version of flexiconc as \"offset\" is included in tokens\n",
105 "# Create the matches DataFrame using the index directly for aggregation\n",
106 "matches = tokens[tokens['offset'] == 0].groupby('line_id').apply(\n",
107 " lambda group: pd.Series({\n",
108 " 'match_start': group.index.min(), # Get the minimum index value for match_start\n",
109 " 'match_end': group.index.max() # Get the maximum index value for match_end\n",
110 " })\n",
111 ").reset_index()\n",
112 "\n",
113 "# Add 'slot' column to the matches DataFrame and populate it with 0's\n",
114 "matches['slot'] = 0"
115 ]
116 },
117 {
118 "cell_type": "code",
119 "execution_count": null,
120 "metadata": {},
121 "outputs": [],
122 "source": [
123 "from flexiconc.concordance import Concordance\n",
124 "\n",
125 "# Create the Concordance object; it is filled by the load() call below\n",
126 "c = Concordance()\n",
127 "\n",
128 "# Load the DataFrames built in the previous cell; tokens already carries the 'offset' column\n",
129 "c.load(\n",
130 " metadata=metadata,\n",
131 " tokens=tokens,\n",
132 " matches=matches\n",
133 ")\n"
134 ]
135 },
136 {
137 "cell_type": "code",
138 "execution_count": 9,
139 "metadata": {},
140 "outputs": [
141 {
142 "data": {
143 "text/html": [
144 "<div style='margin-bottom:10px;'><strong>Query:</strong> </div>\n",
145 "<ul style='list-style-type:none;'>\n",
146 "<li>[1] 🔎 subset (813): </li>\n",
147 "</ul>\n"
148 ],
149 "text/plain": [
150 "<IPython.core.display.HTML object>"
151 ]
152 },
153 "metadata": {},
154 "output_type": "display_data"
155 }
156 ],
157 "source": [
158 "from IPython.display import HTML, display\n",
159 "from flexiconc.visualization.html_visualizer import generate_concordance_html, generate_analysis_tree_html  # generate_concordance_html is used in the next cell\n",
160 "display(HTML(generate_analysis_tree_html(c)))  # render the analysis tree of concordance c as HTML in the cell output"
161 ]
162 },
163 {
164 "cell_type": "code",
165 "execution_count": 10,
166 "metadata": {},
167 "outputs": [
168 {
169 "data": {
170 "text/html": [
171 "\n",
172 " <style>\n",
173 " table.concordance {\n",
174 " border-collapse: collapse;\n",
175 " width: 100%;\n",
176 " table-layout: auto;\n",
177 " }\n",
178 " table.concordance th, table.concordance td {\n",
179 " border: 1px solid #dddddd;\n",
180 " padding: 4px;\n",
181 " vertical-align: top;\n",
182 " white-space: nowrap;\n",
183 " overflow: hidden;\n",
184 " text-overflow: ellipsis;\n",
185 " }\n",
186 " table.concordance th {\n",
187 " background-color: #f2f2f2;\n",
188 " text-align: center;\n",
189 " }\n",
190 " table.concordance th.line-id, table.concordance td.line-id {\n",
191 " text-align: center;\n",
192 " white-space: nowrap;\n",
193 " }\n",
194 " table.concordance th.metadata, table.concordance td.metadata {\n",
195 " text-align: center;\n",
196 " white-space: nowrap;\n",
197 " }\n",
198 " table.concordance th.left-context, table.concordance td.left-context {\n",
199 " text-align: right;\n",
200 " overflow: hidden;\n",
201 " white-space: nowrap;\n",
202 " width: 40%;\n",
203 " max-width: 0px;\n",
204 " }\n",
205 " table.concordance th.node, table.concordance td.node {\n",
206 " text-align: center;\n",
207 " font-weight: bold;\n",
208 " white-space: nowrap;\n",
209 " }\n",
210 " table.concordance th.right-context, table.concordance td.right-context {\n",
211 " text-align: left;\n",
212 " overflow: hidden;\n",
213 " white-space: nowrap;\n",
214 " width: 40%;\n",
215 " max-width: 0px;\n",
216 " }\n",
217 " table.concordance div.left-context {\n",
218 " float: right;\n",
219 " white-space: nowrap;\n",
220 " }\n",
221 " table.concordance div.right-context {\n",
222 " float: left;\n",
223 " white-space: nowrap;\n",
224 " }\n",
225 " </style>\n",
226 " <table class=\"concordance\">\n",
227 " <colgroup>\n",
228 " <col>\n",
229 " <col>\n",
230 " <col>\n",
231 " <col>\n",
232 " </colgroup>\n",
233 " <tr>\n",
234 " <th class=\"line-id\">Line ID</th>\n",
235 " <th class=\"left-context\">Left Context</th>\n",
236 " <th class=\"node\">Node</th>\n",
237 " <th class=\"right-context\">Right Context</th>\n",
238 " </tr>\n",
239 " \n",
240 " <tr>\n",
241 " <td class=\"line-id\">0</td>\n",
242 " \n",
243 " <td class=\"left-context\"><div class=\"left-context\">des Attila, der seinen stattlichen</div></td>\n",
244 " <td class=\"node\">Korpus</td>\n",
245 " <td class=\"right-context\"><div class=\"right-context\">viel zu eng umspannt hielt</div></td>\n",
246 " </tr>\n",
247 " \n",
248 " <tr>\n",
249 " <td class=\"line-id\">1</td>\n",
250 " \n",
251 " <td class=\"left-context\"><div class=\"left-context\">– einem mannshohen Streichinstrument mit gekrümmtem</div></td>\n",
252 " <td class=\"node\">Korpus</td>\n",
253 " <td class=\"right-context\"><div class=\"right-context\">in Form eines Halbmonds,</div></td>\n",
254 " </tr>\n",
255 " \n",
256 " <tr>\n",
257 " <td class=\"line-id\">2</td>\n",
258 " \n",
259 " <td class=\"left-context\"><div class=\"left-context\">an ihren Trommelfellen und brachte den</div></td>\n",
260 " <td class=\"node\">Korpus</td>\n",
261 " <td class=\"right-context\"><div class=\"right-context\">des Wagens zum Wackeln.</div></td>\n",
262 " </tr>\n",
263 " \n",
264 " <tr>\n",
265 " <td class=\"line-id\">3</td>\n",
266 " \n",
267 " <td class=\"left-context\"><div class=\"left-context\">an den Schnittstellen zwischen Tür und</div></td>\n",
268 " <td class=\"node\">Korpus</td>\n",
269 " <td class=\"right-context\"><div class=\"right-context\">. Genau da, wo</div></td>\n",
270 " </tr>\n",
271 " \n",
272 " <tr>\n",
273 " <td class=\"line-id\">4</td>\n",
274 " \n",
275 " <td class=\"left-context\"><div class=\"left-context\">ein halbes Dutzend Klingen aus dem</div></td>\n",
276 " <td class=\"node\">Korpus</td>\n",
277 " <td class=\"right-context\"><div class=\"right-context\">schnellte. Und dann kam</div></td>\n",
278 " </tr>\n",
279 " \n",
280 " <tr>\n",
281 " <td class=\"line-id\">5</td>\n",
282 " \n",
283 " <td class=\"left-context\"><div class=\"left-context\">gebürstete Oberfläche seiner Gliedmaßen verlieh dem</div></td>\n",
284 " <td class=\"node\">Korpus</td>\n",
285 " <td class=\"right-context\"><div class=\"right-context\">etwas Edles, während die</div></td>\n",
286 " </tr>\n",
287 " \n",
288 " <tr>\n",
289 " <td class=\"line-id\">6</td>\n",
290 " \n",
291 " <td class=\"left-context\"><div class=\"left-context\">Multitool, ein Reparaturbot mit flachem</div></td>\n",
292 " <td class=\"node\">Korpus</td>\n",
293 " <td class=\"right-context\"><div class=\"right-context\">, der seine sechs flexiblen</div></td>\n",
294 " </tr>\n",
295 " \n",
296 " <tr>\n",
297 " <td class=\"line-id\">7</td>\n",
298 " \n",
299 " <td class=\"left-context\"><div class=\"left-context\">gehabt, die Kennung an seinem</div></td>\n",
300 " <td class=\"node\">Korpus</td>\n",
301 " <td class=\"right-context\"><div class=\"right-context\">auszumachen, würde aber schnellstmöglich</div></td>\n",
302 " </tr>\n",
303 " \n",
304 " <tr>\n",
305 " <td class=\"line-id\">8</td>\n",
306 " \n",
307 " <td class=\"left-context\"><div class=\"left-context\">Art von Reparaturbots, mit flachem</div></td>\n",
308 " <td class=\"node\">Korpus</td>\n",
309 " <td class=\"right-context\"><div class=\"right-context\">und mehreren Gliedmaßen, die</div></td>\n",
310 " </tr>\n",
311 " \n",
312 " <tr>\n",
313 " <td class=\"line-id\">9</td>\n",
314 " \n",
315 " <td class=\"left-context\"><div class=\"left-context\">und mit gesplittertem Visor, der</div></td>\n",
316 " <td class=\"node\">Korpus</td>\n",
317 " <td class=\"right-context\"><div class=\"right-context\">eines Roboters, dessen Arme</div></td>\n",
318 " </tr>\n",
319 " \n",
320 " <tr>\n",
321 " <td class=\"line-id\">10</td>\n",
322 " \n",
323 " <td class=\"left-context\"><div class=\"left-context\">ihn ein, dass der metallene</div></td>\n",
324 " <td class=\"node\">Korpus</td>\n",
325 " <td class=\"right-context\"><div class=\"right-context\">sich innerhalb weniger Schläge bis</div></td>\n",
326 " </tr>\n",
327 " \n",
328 " <tr>\n",
329 " <td class=\"line-id\">11</td>\n",
330 " \n",
331 " <td class=\"left-context\"><div class=\"left-context\">ab und stieß den übrig gebliebenen</div></td>\n",
332 " <td class=\"node\">Korpus</td>\n",
333 " <td class=\"right-context\"><div class=\"right-context\">mit dem Fuß hinüber zu</div></td>\n",
334 " </tr>\n",
335 " \n",
336 " <tr>\n",
337 " <td class=\"line-id\">12</td>\n",
338 " \n",
339 " <td class=\"left-context\"><div class=\"left-context\">Während die meisten dieser Schüsse vom</div></td>\n",
340 " <td class=\"node\">Korpus</td>\n",
341 " <td class=\"right-context\"><div class=\"right-context\">des Exoskeletts abprallten, erschienen</div></td>\n",
342 " </tr>\n",
343 " \n",
344 " <tr>\n",
345 " <td class=\"line-id\">13</td>\n",
346 " \n",
347 " <td class=\"left-context\"><div class=\"left-context\">. Mehr nicht. Als der</div></td>\n",
348 " <td class=\"node\">Korpus</td>\n",
349 " <td class=\"right-context\"><div class=\"right-context\">des Roboters neben dem ÜberBot</div></td>\n",
350 " </tr>\n",
351 " \n",
352 " <tr>\n",
353 " <td class=\"line-id\">14</td>\n",
354 " \n",
355 " <td class=\"left-context\"><div class=\"left-context\">Multitool, ein Reparaturbot mit flachem</div></td>\n",
356 " <td class=\"node\">Korpus</td>\n",
357 " <td class=\"right-context\"><div class=\"right-context\">und sechs flexiblen ausfahrbaren Gliedmaßen</div></td>\n",
358 " </tr>\n",
359 " \n",
360 " <tr>\n",
361 " <td class=\"line-id\">15</td>\n",
362 " \n",
363 " <td class=\"left-context\"><div class=\"left-context\">heißt, wenn du deinen eigenen</div></td>\n",
364 " <td class=\"node\">Korpus</td>\n",
365 " <td class=\"right-context\"><div class=\"right-context\">in Sicherheit bringen möchtest.</div></td>\n",
366 " </tr>\n",
367 " \n",
368 " <tr>\n",
369 " <td class=\"line-id\">16</td>\n",
370 " \n",
371 " <td class=\"left-context\"><div class=\"left-context\">der erstmals im Jahre 1706 gesammelte</div></td>\n",
372 " <td class=\"node\">Korpus</td>\n",
373 " <td class=\"right-context\"><div class=\"right-context\">von Tang-Lyrik aufweist: Fast</div></td>\n",
374 " </tr>\n",
375 " \n",
376 " <tr>\n",
377 " <td class=\"line-id\">17</td>\n",
378 " \n",
379 " <td class=\"left-context\"><div class=\"left-context\">Gesicht auf einem wie Espenlaub bebenden</div></td>\n",
380 " <td class=\"node\">Korpus</td>\n",
381 " <td class=\"right-context\"><div class=\"right-context\">, – ja in der</div></td>\n",
382 " </tr>\n",
383 " \n",
384 " <tr>\n",
385 " <td class=\"line-id\">18</td>\n",
386 " \n",
387 " <td class=\"left-context\"><div class=\"left-context\">dreißig Fatras sind das größte bekannte</div></td>\n",
388 " <td class=\"node\">Korpus</td>\n",
389 " <td class=\"right-context\"><div class=\"right-context\">. In einem mittelalterlichen Manuskript</div></td>\n",
390 " </tr>\n",
391 " \n",
392 " <tr>\n",
393 " <td class=\"line-id\">19</td>\n",
394 " \n",
395 " <td class=\"left-context\"><div class=\"left-context\">– also die Hälfte des bekannten</div></td>\n",
396 " <td class=\"node\">Korpus</td>\n",
397 " <td class=\"right-context\"><div class=\"right-context\">– wurden aufgenommen. Sie</div></td>\n",
398 " </tr>\n",
399 " </table>"
400 ],
401 "text/plain": [
402 "<IPython.core.display.HTML object>"
403 ]
404 },
405 "metadata": {},
406 "output_type": "display_data"
407 }
408 ],
409 "source": [
410 "display(HTML(generate_concordance_html(c, c.root, n=20)))  # HTML concordance table for node c.root, limited via n=20 (presumably the number of lines shown)"
411 ]
412 }
413 ],
414 "metadata": {
415 "kernelspec": {
416 "display_name": "KorApClient",
417 "language": "python",
418 "name": "python3"
419 },
420 "language_info": {
421 "codemirror_mode": {
422 "name": "ipython",
423 "version": 3
424 },
425 "file_extension": ".py",
426 "mimetype": "text/x-python",
427 "name": "python",
428 "nbconvert_exporter": "python",
429 "pygments_lexer": "ipython3",
430 "version": "3.12.0"
431 }
432 },
433 "nbformat": 4,
434 "nbformat_minor": 2
435}