#!/usr/bin/env python3

from KorAPClient import KorAPConnection
import plotly.express as px
import pandas as pd
| 6 | |
# --- Study parameters --------------------------------------------------------
startYear = 1991  # first publication year covered
endYear = 2020  # last publication year covered (inclusive)
span = 5  # width of one period on the x axis, in years

NODE = "Ei"
COLLOCATES = ["pellen", "schälen"]
COUNTRIES = ["DE", "AT", "CH"]

TITLE = f"Collocation strength of <i>{NODE} + {' / '.join(COLLOCATES)} </i> in {', '.join(COUNTRIES)} {startYear}-{endYear}"


def period_starts(first_year, last_year, width):
    """Return the first year of every period between two years.

    Args:
        first_year: First year of the first period.
        last_year: Last year that has to be covered (inclusive).
        width: Number of years per period.

    Returns:
        A list of period start years, ``width`` years apart.
    """
    # ``last_year + 1`` makes the upper bound inclusive; with an exclusive
    # bound the final year is lost whenever it coincides with a period start.
    return list(range(first_year, last_year + 1, width))


YEARS = period_starts(startYear, endYear, span)


def build_conditions(years, countries, collocates, width, last_year=None):
    """Build one row per (year, country, collocate) combination.

    Args:
        years: Period start years.
        countries: Country codes inserted into the ``pubPlaceKey`` constraint.
        collocates: Collocate words, one condition per word.
        width: Number of years per period.
        last_year: Optional inclusive upper limit; a period that would run
            past it is cut off at this year. ``None`` leaves periods as is.

    Returns:
        A DataFrame with the columns ``year``, ``Country``, ``Collocate``,
        ``vc`` (virtual corpus specification built from country and period)
        and ``Period`` (label used on the x axis).
    """
    # build all combinations of all variables
    conditions = (
        pd.DataFrame(years, columns=["year"])
        .merge(pd.DataFrame(countries, columns=["Country"]), how="cross")
        .merge(pd.DataFrame(collocates, columns=["Collocate"]), how="cross")
    )

    first_years = conditions["year"].tolist()
    final_years = [year + width - 1 for year in first_years]
    if last_year is not None:
        final_years = [min(year, last_year) for year in final_years]

    # virtual corpus specification based on the Country and year variables
    conditions["vc"] = [
        f"textType=/Zeit.*/ & pubPlaceKey={country} & pubDate since {since} & pubDate until {until} "
        for country, since, until in zip(conditions["Country"], first_years, final_years)
    ]

    # label for the x axis
    conditions["Period"] = [
        f"{since}-{until}" for since, until in zip(first_years, final_years)
    ]
    return conditions


def main():
    """Query KorAP for every condition and show the logDice scores as a line plot."""
    df = build_conditions(YEARS, COUNTRIES, COLLOCATES, span, endYear)

    # connect to KorAP API server
    kcon = KorAPConnection(verbose=True)

    # perform the actual KorAP query
    results = kcon.collocationScoreQuery(
        NODE,
        df["Collocate"],
        df["vc"],
        lemmatizeNodeQuery=True,
        lemmatizeCollocateQuery=True,
    )

    # Join the query result columns (axis=1) with the condition columns.
    # concat aligns rows on index labels, so both frames get a fresh 0..n-1
    # index first; that pairs the rows by position.
    # NOTE(review): presumably the result frame comes back with a different
    # index than df - confirm against KorAPClient.
    df = pd.concat([df.reset_index(drop=True), results.reset_index(drop=True)], axis=1)

    fig = px.line(df, title=TITLE, x="Period", y="logDice", color="Country", line_dash="Collocate")
    fig.show()
    # Optional static export of the figure:
    # fig.write_image(f"{NODE}_collocates_{startYear}-{endYear}_in_{'_'.join(COUNTRIES)}.png")


if __name__ == "__main__":
    main()