#!/usr/bin/env python3
"""Plot how strongly NODE collocates with each of COLLOCATES over time.

For every combination of period, country and collocate, a virtual corpus
restricted to that country and period is queried via KorAP, and the resulting
logDice scores are drawn as a line chart (one colour per country, one dash
style per collocate).
"""

from KorAPClient import KorAPConnection
import plotly.express as px
import pandas as pd

startYear = 1991
endYear = 2020
span = 5  # width of one period in years

NODE = "Ei"
COLLOCATES = ["pellen", "schälen"]
COUNTRIES = ["DE", "AT", "CH"]

TITLE = f"Collocation strength of <i>{NODE} + {' / '.join(COLLOCATES)} </i> in {', '.join(COUNTRIES)} {startYear}-{endYear}"

# First year of each period. The range stop is exclusive, so with the values
# above the last period starts in 2016 and covers 2016-2020.
YEARS = list(range(startYear, endYear, span))

# build all combinations of all variables (cross joins: year x Country x Collocate)
df = pd.DataFrame(YEARS, columns=["year"]) \
    .merge(pd.DataFrame(COUNTRIES, columns=["Country"]), how='cross') \
    .merge(pd.DataFrame(COLLOCATES, columns=["Collocate"]), how='cross')

# last year (inclusive) of the period that starts in each row's year
period_end = df['year'] + span - 1

# add column with virtual corpus specifications based on Country and year variables
df['vc'] = [
    f"textType=/Zeit.*/ & pubPlaceKey={country} & pubDate since {first} & pubDate until {last} "
    for country, first, last in zip(df['Country'], df['year'], period_end)]

# add column with label for x axis
df['Period'] = [f"{first}-{last}" for first, last in zip(df['year'], period_end)]

# connect to KorAP API server
kcon = KorAPConnection(verbose=True)

# perform the actual KorAP query (one result row per row of df is expected)
results = kcon.collocationScoreQuery(NODE, df['Collocate'], df['vc'], lemmatizeNodeQuery=True,
                                     lemmatizeCollocateQuery=True)

# join query result columns (axis=1 ...) with condition information columns.
# pd.concat aligns rows on index labels, so both indexes are reset to 0..n-1 to
# force a purely positional join. NOTE(review): presumably the index of the
# returned result frame does not match df's integer index - verify.
df = pd.concat([df.reset_index(drop=True), results.reset_index(drop=True)], axis=1)

fig = px.line(df, title=TITLE, x="Period", y="logDice", color="Country", line_dash="Collocate")
fig.show()
# Optional: also save the figure as an image file.
# fig.write_image(f"{NODE}_collocates_{startYear}-{endYear}_in_{'_'.join(COUNTRIES)}.png")