| { |
| "cells": [ |
| { |
| "cell_type": "code", |
| "execution_count": 1, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stderr", |
| "output_type": "stream", |
| "text": [ |
| "R[write to console]: Welcome to KorAP API for DeLiKo@DNB!\n", |
| "\n" |
| ] |
| }, |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "Searching \"[tt/l=Korpus]\" in \"\"\u001b[0m by this KorAP instance.\u001b[0m\u001b[32m\u001b[32m: 771 hits\u001b[0m\u001b[32m, took 0.11682251 s\n", |
| "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 1/16 in 0.31968938 s\n", |
| "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 2/16 in 0.384494092 s\n", |
| "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 3/16 in 0.229332277 s\n", |
| "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 3/16 in 0.229332277 s\n", |
| "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 4/16 in 0.229332277 s\n", |
| "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 5/16 in 0.397982977 s\n", |
| "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 6/16 in 0.467999036 s\n", |
| "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 7/16 in 1.06042362 s\n", |
| "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 8/16 in 0.504681485 s\n", |
| "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 9/16 in 0.436383899 s\n", |
| "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 10/16 in 0.621623505 s\n", |
| "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 11/16 in 0.258548927 s\n", |
| "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 12/16 in 0.433153669 s\n", |
| "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 13/16 in 0.432334822 s\n", |
| "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 14/16 in 0.399952292 s\n", |
| "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 15/16 in 0.400414105 s\n", |
| "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 16/16 in 0.731605508 s\n", |
| "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 17/16 in 0.173677668 s\n", |
| "\u001b[0m" |
| ] |
| } |
| ], |
| "source": [ |
| "from KorAPClient import KorAPConnection\n", |
| "from rpy2.robjects import r\n", |
| "\n", |
| "# As base, use the fiction corpus DeLiKo@DNB (see <https://doi.org/10.5281/zenodo.14943116>)\n", |
| "kcon = KorAPConnection(KorAPUrl=\"https://korap.dnb.de/\", verbose=True).auth()\n", |
| "\n", |
| "r['set.seed'](42) # Set the seed for reproducibility, will in future be exported by KorAPClient\n", |
| "q = kcon.corpusQuery(\"[tt/l=Korpus]\", metadataOnly=False)\n", |
| "q = q.fetchNext(maxFetch=1000, randomizePageOrder=True)" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stderr", |
| "output_type": "stream", |
| "text": [ |
| "/var/folders/76/__9t5rnd5k94skg1118jhpw00000gn/T/ipykernel_77388/1758551592.py:36: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", |
| " matches = tokens[tokens['offset'] == 0].groupby('line_id').apply(\n" |
| ] |
| } |
| ], |
| "source": [ |
| "results = q.slots['collectedMatches']\n", |
| "# use index as line_id\n", |
| "results[\"line_id\"] = results.index\n", |
| "\n", |
| "# take all columns except snippet, tokens.left, tokens.match, tokens.right, matchStart, matchEnd\n", |
| "drop_columns = ['snippet', 'tokens.left', 'tokens.match', 'tokens.right', 'matchStart', 'matchEnd']\n", |
| "metadata = results.drop(columns=drop_columns)\n", |
| "\n", |
| "tokens = []\n", |
| "for _, line in results.iterrows():\n", |
| " left_context, match, right_context = (line[col].split(\"\\t\") for col in ['tokens.left', 'tokens.match', 'tokens.right'])\n", |
| " id_in_line = -1 # will be obsolete in later version of flexiconc\n", |
| " \n", |
| " if left_context != [\"\"]:\n", |
| " for i, token in enumerate(left_context):\n", |
| " id_in_line += 1\n", |
| " offset = 0 - len(left_context) + i\n", |
| " tokens.append([offset, token, line[\"line_id\"], id_in_line])\n", |
| " \n", |
| " for i, token in enumerate(match):\n", |
| " id_in_line += 1\n", |
| " offset = 0\n", |
| " tokens.append([offset, token, line[\"line_id\"], id_in_line])\n", |
| " \n", |
| " if right_context != [\"\"]:\n", |
| " for i, token in enumerate(right_context):\n", |
| " id_in_line += 1\n", |
| " offset = i + 1\n", |
| " tokens.append([offset, token, line[\"line_id\"], id_in_line])\n", |
| "\n", |
| "# create a dataframe from the tokens list\n", |
| "import pandas as pd\n", |
| "tokens = pd.DataFrame(tokens, columns=[\"offset\", \"word\", \"line_id\", \"id_in_line\"])\n", |
| "\n", |
| "# matches df will be obsolete in later version of flexiconc as \"offset\" is included in tokens\n", |
| "# Create the matches DataFrame using the index directly for aggregation\n", |
| "matches = tokens[tokens['offset'] == 0].groupby('line_id').apply(\n", |
| " lambda group: pd.Series({\n", |
| " 'match_start': group.index.min(), # Get the minimum index value for match_start\n", |
| " 'match_end': group.index.max() # Get the maximum index value for match_end\n", |
| " })\n", |
| ").reset_index()\n", |
| "\n", |
| "# Add 'slot' column to the matches DataFrame and populate it with 0's\n", |
| "matches['slot'] = 0" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": {}, |
| "outputs": [], |
| "source": [ |
| "from flexiconc.concordance import Concordance\n", |
| "\n", |
| "# Create the Concordance object\n", |
| "c = Concordance()\n", |
| "\n", |
| "# Option A: Tokens file includes 'offset'\n", |
| "c.load(\n", |
| " metadata=metadata,\n", |
| " tokens=tokens,\n", |
| " matches=matches\n", |
| ")\n" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 9, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "data": { |
| "text/html": [ |
| "<div style='margin-bottom:10px;'><strong>Query:</strong> </div>\n", |
| "<ul style='list-style-type:none;'>\n", |
| "<li>[1] 🔎 subset (813): </li>\n", |
| "</ul>\n" |
| ], |
| "text/plain": [ |
| "<IPython.core.display.HTML object>" |
| ] |
| }, |
| "metadata": {}, |
| "output_type": "display_data" |
| } |
| ], |
| "source": [ |
| "from IPython.display import HTML, display\n", |
| "from flexiconc.visualization.html_visualizer import generate_concordance_html, generate_analysis_tree_html\n", |
| "display(HTML(generate_analysis_tree_html(c)))" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 10, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "data": { |
| "text/html": [ |
| "\n", |
| " <style>\n", |
| " table.concordance {\n", |
| " border-collapse: collapse;\n", |
| " width: 100%;\n", |
| " table-layout: auto;\n", |
| " }\n", |
| " table.concordance th, table.concordance td {\n", |
| " border: 1px solid #dddddd;\n", |
| " padding: 4px;\n", |
| " vertical-align: top;\n", |
| " white-space: nowrap;\n", |
| " overflow: hidden;\n", |
| " text-overflow: ellipsis;\n", |
| " }\n", |
| " table.concordance th {\n", |
| " background-color: #f2f2f2;\n", |
| " text-align: center;\n", |
| " }\n", |
| " table.concordance th.line-id, table.concordance td.line-id {\n", |
| " text-align: center;\n", |
| " white-space: nowrap;\n", |
| " }\n", |
| " table.concordance th.metadata, table.concordance td.metadata {\n", |
| " text-align: center;\n", |
| " white-space: nowrap;\n", |
| " }\n", |
| " table.concordance th.left-context, table.concordance td.left-context {\n", |
| " text-align: right;\n", |
| " overflow: hidden;\n", |
| " white-space: nowrap;\n", |
| " width: 40%;\n", |
| " max-width: 0px;\n", |
| " }\n", |
| " table.concordance th.node, table.concordance td.node {\n", |
| " text-align: center;\n", |
| " font-weight: bold;\n", |
| " white-space: nowrap;\n", |
| " }\n", |
| " table.concordance th.right-context, table.concordance td.right-context {\n", |
| " text-align: left;\n", |
| " overflow: hidden;\n", |
| " white-space: nowrap;\n", |
| " width: 40%;\n", |
| " max-width: 0px;\n", |
| " }\n", |
| " table.concordance div.left-context {\n", |
| " float: right;\n", |
| " white-space: nowrap;\n", |
| " }\n", |
| " table.concordance div.right-context {\n", |
| " float: left;\n", |
| " white-space: nowrap;\n", |
| " }\n", |
| " </style>\n", |
| " <table class=\"concordance\">\n", |
| " <colgroup>\n", |
| " <col>\n", |
| " <col>\n", |
| " <col>\n", |
| " <col>\n", |
| " </colgroup>\n", |
| " <tr>\n", |
| " <th class=\"line-id\">Line ID</th>\n", |
| " <th class=\"left-context\">Left Context</th>\n", |
| " <th class=\"node\">Node</th>\n", |
| " <th class=\"right-context\">Right Context</th>\n", |
| " </tr>\n", |
| " \n", |
| " <tr>\n", |
| " <td class=\"line-id\">0</td>\n", |
| " \n", |
| " <td class=\"left-context\"><div class=\"left-context\">des Attila, der seinen stattlichen</div></td>\n", |
| " <td class=\"node\">Korpus</td>\n", |
| " <td class=\"right-context\"><div class=\"right-context\">viel zu eng umspannt hielt</div></td>\n", |
| " </tr>\n", |
| " \n", |
| " <tr>\n", |
| " <td class=\"line-id\">1</td>\n", |
| " \n", |
| " <td class=\"left-context\"><div class=\"left-context\">– einem mannshohen Streichinstrument mit gekrümmtem</div></td>\n", |
| " <td class=\"node\">Korpus</td>\n", |
| " <td class=\"right-context\"><div class=\"right-context\">in Form eines Halbmonds,</div></td>\n", |
| " </tr>\n", |
| " \n", |
| " <tr>\n", |
| " <td class=\"line-id\">2</td>\n", |
| " \n", |
| " <td class=\"left-context\"><div class=\"left-context\">an ihren Trommelfellen und brachte den</div></td>\n", |
| " <td class=\"node\">Korpus</td>\n", |
| " <td class=\"right-context\"><div class=\"right-context\">des Wagens zum Wackeln.</div></td>\n", |
| " </tr>\n", |
| " \n", |
| " <tr>\n", |
| " <td class=\"line-id\">3</td>\n", |
| " \n", |
| " <td class=\"left-context\"><div class=\"left-context\">an den Schnittstellen zwischen Tür und</div></td>\n", |
| " <td class=\"node\">Korpus</td>\n", |
| " <td class=\"right-context\"><div class=\"right-context\">. Genau da, wo</div></td>\n", |
| " </tr>\n", |
| " \n", |
| " <tr>\n", |
| " <td class=\"line-id\">4</td>\n", |
| " \n", |
| " <td class=\"left-context\"><div class=\"left-context\">ein halbes Dutzend Klingen aus dem</div></td>\n", |
| " <td class=\"node\">Korpus</td>\n", |
| " <td class=\"right-context\"><div class=\"right-context\">schnellte. Und dann kam</div></td>\n", |
| " </tr>\n", |
| " \n", |
| " <tr>\n", |
| " <td class=\"line-id\">5</td>\n", |
| " \n", |
| " <td class=\"left-context\"><div class=\"left-context\">gebürstete Oberfläche seiner Gliedmaßen verlieh dem</div></td>\n", |
| " <td class=\"node\">Korpus</td>\n", |
| " <td class=\"right-context\"><div class=\"right-context\">etwas Edles, während die</div></td>\n", |
| " </tr>\n", |
| " \n", |
| " <tr>\n", |
| " <td class=\"line-id\">6</td>\n", |
| " \n", |
| " <td class=\"left-context\"><div class=\"left-context\">Multitool, ein Reparaturbot mit flachem</div></td>\n", |
| " <td class=\"node\">Korpus</td>\n", |
| " <td class=\"right-context\"><div class=\"right-context\">, der seine sechs flexiblen</div></td>\n", |
| " </tr>\n", |
| " \n", |
| " <tr>\n", |
| " <td class=\"line-id\">7</td>\n", |
| " \n", |
| " <td class=\"left-context\"><div class=\"left-context\">gehabt, die Kennung an seinem</div></td>\n", |
| " <td class=\"node\">Korpus</td>\n", |
| " <td class=\"right-context\"><div class=\"right-context\">auszumachen, würde aber schnellstmöglich</div></td>\n", |
| " </tr>\n", |
| " \n", |
| " <tr>\n", |
| " <td class=\"line-id\">8</td>\n", |
| " \n", |
| " <td class=\"left-context\"><div class=\"left-context\">Art von Reparaturbots, mit flachem</div></td>\n", |
| " <td class=\"node\">Korpus</td>\n", |
| " <td class=\"right-context\"><div class=\"right-context\">und mehreren Gliedmaßen, die</div></td>\n", |
| " </tr>\n", |
| " \n", |
| " <tr>\n", |
| " <td class=\"line-id\">9</td>\n", |
| " \n", |
| " <td class=\"left-context\"><div class=\"left-context\">und mit gesplittertem Visor, der</div></td>\n", |
| " <td class=\"node\">Korpus</td>\n", |
| " <td class=\"right-context\"><div class=\"right-context\">eines Roboters, dessen Arme</div></td>\n", |
| " </tr>\n", |
| " \n", |
| " <tr>\n", |
| " <td class=\"line-id\">10</td>\n", |
| " \n", |
| " <td class=\"left-context\"><div class=\"left-context\">ihn ein, dass der metallene</div></td>\n", |
| " <td class=\"node\">Korpus</td>\n", |
| " <td class=\"right-context\"><div class=\"right-context\">sich innerhalb weniger Schläge bis</div></td>\n", |
| " </tr>\n", |
| " \n", |
| " <tr>\n", |
| " <td class=\"line-id\">11</td>\n", |
| " \n", |
| " <td class=\"left-context\"><div class=\"left-context\">ab und stieß den übrig gebliebenen</div></td>\n", |
| " <td class=\"node\">Korpus</td>\n", |
| " <td class=\"right-context\"><div class=\"right-context\">mit dem Fuß hinüber zu</div></td>\n", |
| " </tr>\n", |
| " \n", |
| " <tr>\n", |
| " <td class=\"line-id\">12</td>\n", |
| " \n", |
| " <td class=\"left-context\"><div class=\"left-context\">Während die meisten dieser Schüsse vom</div></td>\n", |
| " <td class=\"node\">Korpus</td>\n", |
| " <td class=\"right-context\"><div class=\"right-context\">des Exoskeletts abprallten, erschienen</div></td>\n", |
| " </tr>\n", |
| " \n", |
| " <tr>\n", |
| " <td class=\"line-id\">13</td>\n", |
| " \n", |
| " <td class=\"left-context\"><div class=\"left-context\">. Mehr nicht. Als der</div></td>\n", |
| " <td class=\"node\">Korpus</td>\n", |
| " <td class=\"right-context\"><div class=\"right-context\">des Roboters neben dem ÜberBot</div></td>\n", |
| " </tr>\n", |
| " \n", |
| " <tr>\n", |
| " <td class=\"line-id\">14</td>\n", |
| " \n", |
| " <td class=\"left-context\"><div class=\"left-context\">Multitool, ein Reparaturbot mit flachem</div></td>\n", |
| " <td class=\"node\">Korpus</td>\n", |
| " <td class=\"right-context\"><div class=\"right-context\">und sechs flexiblen ausfahrbaren Gliedmaßen</div></td>\n", |
| " </tr>\n", |
| " \n", |
| " <tr>\n", |
| " <td class=\"line-id\">15</td>\n", |
| " \n", |
| " <td class=\"left-context\"><div class=\"left-context\">heißt, wenn du deinen eigenen</div></td>\n", |
| " <td class=\"node\">Korpus</td>\n", |
| " <td class=\"right-context\"><div class=\"right-context\">in Sicherheit bringen möchtest.</div></td>\n", |
| " </tr>\n", |
| " \n", |
| " <tr>\n", |
| " <td class=\"line-id\">16</td>\n", |
| " \n", |
| " <td class=\"left-context\"><div class=\"left-context\">der erstmals im Jahre 1706 gesammelte</div></td>\n", |
| " <td class=\"node\">Korpus</td>\n", |
| " <td class=\"right-context\"><div class=\"right-context\">von Tang-Lyrik aufweist: Fast</div></td>\n", |
| " </tr>\n", |
| " \n", |
| " <tr>\n", |
| " <td class=\"line-id\">17</td>\n", |
| " \n", |
| " <td class=\"left-context\"><div class=\"left-context\">Gesicht auf einem wie Espenlaub bebenden</div></td>\n", |
| " <td class=\"node\">Korpus</td>\n", |
| " <td class=\"right-context\"><div class=\"right-context\">, – ja in der</div></td>\n", |
| " </tr>\n", |
| " \n", |
| " <tr>\n", |
| " <td class=\"line-id\">18</td>\n", |
| " \n", |
| " <td class=\"left-context\"><div class=\"left-context\">dreißig Fatras sind das größte bekannte</div></td>\n", |
| " <td class=\"node\">Korpus</td>\n", |
| " <td class=\"right-context\"><div class=\"right-context\">. In einem mittelalterlichen Manuskript</div></td>\n", |
| " </tr>\n", |
| " \n", |
| " <tr>\n", |
| " <td class=\"line-id\">19</td>\n", |
| " \n", |
| " <td class=\"left-context\"><div class=\"left-context\">– also die Hälfte des bekannten</div></td>\n", |
| " <td class=\"node\">Korpus</td>\n", |
| " <td class=\"right-context\"><div class=\"right-context\">– wurden aufgenommen. Sie</div></td>\n", |
| " </tr>\n", |
| " </table>" |
| ], |
| "text/plain": [ |
| "<IPython.core.display.HTML object>" |
| ] |
| }, |
| "metadata": {}, |
| "output_type": "display_data" |
| } |
| ], |
| "source": [ |
| "display(HTML(generate_concordance_html(c, c.root, n=20)))" |
| ] |
| } |
| ], |
| "metadata": { |
| "kernelspec": { |
| "display_name": "KorApClient", |
| "language": "python", |
| "name": "python3" |
| }, |
| "language_info": { |
| "codemirror_mode": { |
| "name": "ipython", |
| "version": 3 |
| }, |
| "file_extension": ".py", |
| "mimetype": "text/x-python", |
| "name": "python", |
| "nbconvert_exporter": "python", |
| "pygments_lexer": "ipython3", |
| "version": "3.12.0" |
| } |
| }, |
| "nbformat": 4, |
| "nbformat_minor": 2 |
| } |