add notebook with basic flexiconc example
Change-Id: I09b6cb872d1bd74d173541dec8bf5ac9bd63d468
diff --git a/examples/example_flexiconc.ipynb b/examples/example_flexiconc.ipynb
new file mode 100644
index 0000000..3957b9d
--- /dev/null
+++ b/examples/example_flexiconc.ipynb
@@ -0,0 +1,435 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "R[write to console]: Welcome to KorAP API for DeLiKo@DNB!\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Searching \"[tt/l=Korpus]\" in \"\"\u001b[0m by this KorAP instance.\u001b[0m\u001b[32m\u001b[32m: 771 hits\u001b[0m\u001b[32m, took 0.11682251 s\n",
+ "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 1/16 in 0.31968938 s\n",
+ "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 2/16 in 0.384494092 s\n",
+ "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 3/16 in 0.229332277 s\n",
+ "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 3/16 in 0.229332277 s\n",
+ "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 4/16 in 0.229332277 s\n",
+ "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 5/16 in 0.397982977 s\n",
+ "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 6/16 in 0.467999036 s\n",
+ "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 7/16 in 1.06042362 s\n",
+ "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 8/16 in 0.504681485 s\n",
+ "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 9/16 in 0.436383899 s\n",
+ "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 10/16 in 0.621623505 s\n",
+ "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 11/16 in 0.258548927 s\n",
+ "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 12/16 in 0.433153669 s\n",
+ "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 13/16 in 0.432334822 s\n",
+ "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 14/16 in 0.399952292 s\n",
+ "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 15/16 in 0.400414105 s\n",
+ "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 16/16 in 0.731605508 s\n",
+ "\u001b[0m\u001b[32mUsing fields API: \u001b[0m\u001b[32mRetrieved page 17/16 in 0.173677668 s\n",
+ "\u001b[0m"
+ ]
+ }
+ ],
+ "source": [
+ "from KorAPClient import KorAPConnection\n",
+ "from rpy2.robjects import r\n",
+ "\n",
+ "# Connect to the fiction corpus DeLiKo@DNB (see <https://doi.org/10.5281/zenodo.14943116>)\n",
+ "kcon = KorAPConnection(KorAPUrl=\"https://korap.dnb.de/\", verbose=True).auth()\n",
+ "\n",
+ "# Seed R's RNG for reproducibility; set.seed will in future be exported by KorAPClient\n",
+ "r['set.seed'](42)\n",
+ "q = kcon.corpusQuery(\"[tt/l=Korpus]\", metadataOnly=False).fetchNext(maxFetch=1000, randomizePageOrder=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/var/folders/76/__9t5rnd5k94skg1118jhpw00000gn/T/ipykernel_77388/1758551592.py:36: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+ " matches = tokens[tokens['offset'] == 0].groupby('line_id').apply(\n"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "# use the row index as line_id\n",
+ "results = q.slots['collectedMatches']\n",
+ "results[\"line_id\"] = results.index\n",
+ "\n",
+ "# metadata: all columns except snippet, tokens.left, tokens.match, tokens.right, matchStart, matchEnd\n",
+ "drop_columns = ['snippet', 'tokens.left', 'tokens.match', 'tokens.right', 'matchStart', 'matchEnd']\n",
+ "metadata = results.drop(columns=drop_columns)\n",
+ "\n",
+ "\n",
+ "def split_tokens(cell):\n",
+ "    \"\"\"Split a tab-separated token string into a list; an empty string yields no tokens.\"\"\"\n",
+ "    return cell.split(\"\\t\") if cell else []\n",
+ "\n",
+ "\n",
+ "# One row per token: offset is negative left of the match, 0 on it and positive right of it.\n",
+ "token_rows = []\n",
+ "for _, line in results.iterrows():\n",
+ "    left_context = split_tokens(line['tokens.left'])\n",
+ "    match = line['tokens.match'].split(\"\\t\")\n",
+ "    right_context = split_tokens(line['tokens.right'])\n",
+ "    offsets = list(range(-len(left_context), 0)) + [0] * len(match) + list(range(1, len(right_context) + 1))\n",
+ "    words = left_context + match + right_context\n",
+ "    # id_in_line (running position within the line) will be obsolete in later version of flexiconc\n",
+ "    for id_in_line, (offset, word) in enumerate(zip(offsets, words)):\n",
+ "        token_rows.append([offset, word, line[\"line_id\"], id_in_line])\n",
+ "\n",
+ "# create a dataframe from the token rows\n",
+ "tokens = pd.DataFrame(token_rows, columns=[\"offset\", \"word\", \"line_id\", \"id_in_line\"])\n",
+ "\n",
+ "# matches df will be obsolete in later version of flexiconc as \"offset\" is included in tokens\n",
+ "# match_start / match_end: first / last row index in tokens of each line's offset-0 tokens.\n",
+ "# Named aggregation avoids the pandas DeprecationWarning raised by groupby().apply().\n",
+ "matches = (\n",
+ "    tokens[tokens['offset'] == 0]\n",
+ "    .rename_axis('token_id')\n",
+ "    .reset_index()\n",
+ "    .groupby('line_id')['token_id']\n",
+ "    .agg(match_start='min', match_end='max')\n",
+ "    .reset_index()\n",
+ ")\n",
+ "\n",
+ "# Add 'slot' column to the matches DataFrame and populate it with 0's\n",
+ "matches['slot'] = 0"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from flexiconc.concordance import Concordance\n",
+ "\n",
+ "# Start with an empty Concordance object\n",
+ "c = Concordance()\n",
+ "\n",
+ "# Option A: the tokens table includes an 'offset' column\n",
+ "c.load(\n",
+ "    matches=matches,\n",
+ "    tokens=tokens,\n",
+ "    metadata=metadata,\n",
+ ")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div style='margin-bottom:10px;'><strong>Query:</strong> </div>\n",
+ "<ul style='list-style-type:none;'>\n",
+ "<li>[1] 🔎 subset (813): </li>\n",
+ "</ul>\n"
+ ],
+ "text/plain": [
+ "<IPython.core.display.HTML object>"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "from IPython.display import HTML, display\n",
+ "from flexiconc.visualization.html_visualizer import generate_concordance_html, generate_analysis_tree_html\n",
+ "display(HTML(data=generate_analysis_tree_html(c)))  # render the analysis tree of concordance c"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " <style>\n",
+ " table.concordance {\n",
+ " border-collapse: collapse;\n",
+ " width: 100%;\n",
+ " table-layout: auto;\n",
+ " }\n",
+ " table.concordance th, table.concordance td {\n",
+ " border: 1px solid #dddddd;\n",
+ " padding: 4px;\n",
+ " vertical-align: top;\n",
+ " white-space: nowrap;\n",
+ " overflow: hidden;\n",
+ " text-overflow: ellipsis;\n",
+ " }\n",
+ " table.concordance th {\n",
+ " background-color: #f2f2f2;\n",
+ " text-align: center;\n",
+ " }\n",
+ " table.concordance th.line-id, table.concordance td.line-id {\n",
+ " text-align: center;\n",
+ " white-space: nowrap;\n",
+ " }\n",
+ " table.concordance th.metadata, table.concordance td.metadata {\n",
+ " text-align: center;\n",
+ " white-space: nowrap;\n",
+ " }\n",
+ " table.concordance th.left-context, table.concordance td.left-context {\n",
+ " text-align: right;\n",
+ " overflow: hidden;\n",
+ " white-space: nowrap;\n",
+ " width: 40%;\n",
+ " max-width: 0px;\n",
+ " }\n",
+ " table.concordance th.node, table.concordance td.node {\n",
+ " text-align: center;\n",
+ " font-weight: bold;\n",
+ " white-space: nowrap;\n",
+ " }\n",
+ " table.concordance th.right-context, table.concordance td.right-context {\n",
+ " text-align: left;\n",
+ " overflow: hidden;\n",
+ " white-space: nowrap;\n",
+ " width: 40%;\n",
+ " max-width: 0px;\n",
+ " }\n",
+ " table.concordance div.left-context {\n",
+ " float: right;\n",
+ " white-space: nowrap;\n",
+ " }\n",
+ " table.concordance div.right-context {\n",
+ " float: left;\n",
+ " white-space: nowrap;\n",
+ " }\n",
+ " </style>\n",
+ " <table class=\"concordance\">\n",
+ " <colgroup>\n",
+ " <col>\n",
+ " <col>\n",
+ " <col>\n",
+ " <col>\n",
+ " </colgroup>\n",
+ " <tr>\n",
+ " <th class=\"line-id\">Line ID</th>\n",
+ " <th class=\"left-context\">Left Context</th>\n",
+ " <th class=\"node\">Node</th>\n",
+ " <th class=\"right-context\">Right Context</th>\n",
+ " </tr>\n",
+ " \n",
+ " <tr>\n",
+ " <td class=\"line-id\">0</td>\n",
+ " \n",
+ " <td class=\"left-context\"><div class=\"left-context\">des Attila, der seinen stattlichen</div></td>\n",
+ " <td class=\"node\">Korpus</td>\n",
+ " <td class=\"right-context\"><div class=\"right-context\">viel zu eng umspannt hielt</div></td>\n",
+ " </tr>\n",
+ " \n",
+ " <tr>\n",
+ " <td class=\"line-id\">1</td>\n",
+ " \n",
+ " <td class=\"left-context\"><div class=\"left-context\">– einem mannshohen Streichinstrument mit gekrümmtem</div></td>\n",
+ " <td class=\"node\">Korpus</td>\n",
+ " <td class=\"right-context\"><div class=\"right-context\">in Form eines Halbmonds,</div></td>\n",
+ " </tr>\n",
+ " \n",
+ " <tr>\n",
+ " <td class=\"line-id\">2</td>\n",
+ " \n",
+ " <td class=\"left-context\"><div class=\"left-context\">an ihren Trommelfellen und brachte den</div></td>\n",
+ " <td class=\"node\">Korpus</td>\n",
+ " <td class=\"right-context\"><div class=\"right-context\">des Wagens zum Wackeln.</div></td>\n",
+ " </tr>\n",
+ " \n",
+ " <tr>\n",
+ " <td class=\"line-id\">3</td>\n",
+ " \n",
+ " <td class=\"left-context\"><div class=\"left-context\">an den Schnittstellen zwischen Tür und</div></td>\n",
+ " <td class=\"node\">Korpus</td>\n",
+ " <td class=\"right-context\"><div class=\"right-context\">. Genau da, wo</div></td>\n",
+ " </tr>\n",
+ " \n",
+ " <tr>\n",
+ " <td class=\"line-id\">4</td>\n",
+ " \n",
+ " <td class=\"left-context\"><div class=\"left-context\">ein halbes Dutzend Klingen aus dem</div></td>\n",
+ " <td class=\"node\">Korpus</td>\n",
+ " <td class=\"right-context\"><div class=\"right-context\">schnellte. Und dann kam</div></td>\n",
+ " </tr>\n",
+ " \n",
+ " <tr>\n",
+ " <td class=\"line-id\">5</td>\n",
+ " \n",
+ " <td class=\"left-context\"><div class=\"left-context\">gebürstete Oberfläche seiner Gliedmaßen verlieh dem</div></td>\n",
+ " <td class=\"node\">Korpus</td>\n",
+ " <td class=\"right-context\"><div class=\"right-context\">etwas Edles, während die</div></td>\n",
+ " </tr>\n",
+ " \n",
+ " <tr>\n",
+ " <td class=\"line-id\">6</td>\n",
+ " \n",
+ " <td class=\"left-context\"><div class=\"left-context\">Multitool, ein Reparaturbot mit flachem</div></td>\n",
+ " <td class=\"node\">Korpus</td>\n",
+ " <td class=\"right-context\"><div class=\"right-context\">, der seine sechs flexiblen</div></td>\n",
+ " </tr>\n",
+ " \n",
+ " <tr>\n",
+ " <td class=\"line-id\">7</td>\n",
+ " \n",
+ " <td class=\"left-context\"><div class=\"left-context\">gehabt, die Kennung an seinem</div></td>\n",
+ " <td class=\"node\">Korpus</td>\n",
+ " <td class=\"right-context\"><div class=\"right-context\">auszumachen, würde aber schnellstmöglich</div></td>\n",
+ " </tr>\n",
+ " \n",
+ " <tr>\n",
+ " <td class=\"line-id\">8</td>\n",
+ " \n",
+ " <td class=\"left-context\"><div class=\"left-context\">Art von Reparaturbots, mit flachem</div></td>\n",
+ " <td class=\"node\">Korpus</td>\n",
+ " <td class=\"right-context\"><div class=\"right-context\">und mehreren Gliedmaßen, die</div></td>\n",
+ " </tr>\n",
+ " \n",
+ " <tr>\n",
+ " <td class=\"line-id\">9</td>\n",
+ " \n",
+ " <td class=\"left-context\"><div class=\"left-context\">und mit gesplittertem Visor, der</div></td>\n",
+ " <td class=\"node\">Korpus</td>\n",
+ " <td class=\"right-context\"><div class=\"right-context\">eines Roboters, dessen Arme</div></td>\n",
+ " </tr>\n",
+ " \n",
+ " <tr>\n",
+ " <td class=\"line-id\">10</td>\n",
+ " \n",
+ " <td class=\"left-context\"><div class=\"left-context\">ihn ein, dass der metallene</div></td>\n",
+ " <td class=\"node\">Korpus</td>\n",
+ " <td class=\"right-context\"><div class=\"right-context\">sich innerhalb weniger Schläge bis</div></td>\n",
+ " </tr>\n",
+ " \n",
+ " <tr>\n",
+ " <td class=\"line-id\">11</td>\n",
+ " \n",
+ " <td class=\"left-context\"><div class=\"left-context\">ab und stieß den übrig gebliebenen</div></td>\n",
+ " <td class=\"node\">Korpus</td>\n",
+ " <td class=\"right-context\"><div class=\"right-context\">mit dem Fuß hinüber zu</div></td>\n",
+ " </tr>\n",
+ " \n",
+ " <tr>\n",
+ " <td class=\"line-id\">12</td>\n",
+ " \n",
+ " <td class=\"left-context\"><div class=\"left-context\">Während die meisten dieser Schüsse vom</div></td>\n",
+ " <td class=\"node\">Korpus</td>\n",
+ " <td class=\"right-context\"><div class=\"right-context\">des Exoskeletts abprallten, erschienen</div></td>\n",
+ " </tr>\n",
+ " \n",
+ " <tr>\n",
+ " <td class=\"line-id\">13</td>\n",
+ " \n",
+ " <td class=\"left-context\"><div class=\"left-context\">. Mehr nicht. Als der</div></td>\n",
+ " <td class=\"node\">Korpus</td>\n",
+ " <td class=\"right-context\"><div class=\"right-context\">des Roboters neben dem ÜberBot</div></td>\n",
+ " </tr>\n",
+ " \n",
+ " <tr>\n",
+ " <td class=\"line-id\">14</td>\n",
+ " \n",
+ " <td class=\"left-context\"><div class=\"left-context\">Multitool, ein Reparaturbot mit flachem</div></td>\n",
+ " <td class=\"node\">Korpus</td>\n",
+ " <td class=\"right-context\"><div class=\"right-context\">und sechs flexiblen ausfahrbaren Gliedmaßen</div></td>\n",
+ " </tr>\n",
+ " \n",
+ " <tr>\n",
+ " <td class=\"line-id\">15</td>\n",
+ " \n",
+ " <td class=\"left-context\"><div class=\"left-context\">heißt, wenn du deinen eigenen</div></td>\n",
+ " <td class=\"node\">Korpus</td>\n",
+ " <td class=\"right-context\"><div class=\"right-context\">in Sicherheit bringen möchtest.</div></td>\n",
+ " </tr>\n",
+ " \n",
+ " <tr>\n",
+ " <td class=\"line-id\">16</td>\n",
+ " \n",
+ " <td class=\"left-context\"><div class=\"left-context\">der erstmals im Jahre 1706 gesammelte</div></td>\n",
+ " <td class=\"node\">Korpus</td>\n",
+ " <td class=\"right-context\"><div class=\"right-context\">von Tang-Lyrik aufweist: Fast</div></td>\n",
+ " </tr>\n",
+ " \n",
+ " <tr>\n",
+ " <td class=\"line-id\">17</td>\n",
+ " \n",
+ " <td class=\"left-context\"><div class=\"left-context\">Gesicht auf einem wie Espenlaub bebenden</div></td>\n",
+ " <td class=\"node\">Korpus</td>\n",
+ " <td class=\"right-context\"><div class=\"right-context\">, – ja in der</div></td>\n",
+ " </tr>\n",
+ " \n",
+ " <tr>\n",
+ " <td class=\"line-id\">18</td>\n",
+ " \n",
+ " <td class=\"left-context\"><div class=\"left-context\">dreißig Fatras sind das größte bekannte</div></td>\n",
+ " <td class=\"node\">Korpus</td>\n",
+ " <td class=\"right-context\"><div class=\"right-context\">. In einem mittelalterlichen Manuskript</div></td>\n",
+ " </tr>\n",
+ " \n",
+ " <tr>\n",
+ " <td class=\"line-id\">19</td>\n",
+ " \n",
+ " <td class=\"left-context\"><div class=\"left-context\">– also die Hälfte des bekannten</div></td>\n",
+ " <td class=\"node\">Korpus</td>\n",
+ " <td class=\"right-context\"><div class=\"right-context\">– wurden aufgenommen. Sie</div></td>\n",
+ " </tr>\n",
+ " </table>"
+ ],
+ "text/plain": [
+ "<IPython.core.display.HTML object>"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "display(HTML(data=generate_concordance_html(c, c.root, n=20)))  # render 20 concordance lines of the root node"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "KorApClient",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}