# import tools
# general tools
import altair as alt
import pandas as pd
import requests
# geovisualisation tools
import geopandas as gpd
from OSMPythonTools.overpass import overpassQueryBuilder
from OSMPythonTools.nominatim import Nominatim
nominatim = Nominatim()
from OSMPythonTools.overpass import Overpass
overpass = Overpass()
from shapely.ops import orient
# text processing tools
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('omw-1.4')
nltk.download('words')
#!pip install SpaCy
#!python -m spacy download en_core_web_sm
#!pip install wordcloud
#!pip install ipywidgets
from nltk.stem import PorterStemmer, WordNetLemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
from nltk.corpus import wordnet # for robust lemmatization
# find the universities of my survey participants
# create a DataFrame with all the university names
concrete_universities = pd.DataFrame({
"name": ["Universität Potsdam", "Fachhochschule Potsdam", "Freie Universität Berlin", "Universität Stuttgart", "Julius-Maximilians-Universität Würzburg", "Universität Regensburg", "Hochschule Heilbronn Campus Sontheim"],
"city": ["Potsdam", "Potsdam", "Berlin", "Stuttgart", "Würzburg", "Regensburg", "Heilbronn"],
"address": ["Am Neuen Palais 10, 14469 Potsdam", "Kiepenheuerallee 5, 14469 Potsdam", "Kaiserswerther Str. 16, 14195 Berlin", "Keplerstraße 7, 70174 Stuttgart","Sanderring 2, 97070 Würzburg", "Universitätsstraße 31, 93053 Regensburg", "Max-Planck-Straße 39, 74081 Heilbronn"],
"amount": [3, 4, 1, 1, 2, 3, 1]
}
)
# import Nominatim as the geopy geocoder
# (this shadows the OSMPythonTools Nominatim class imported above; the nominatim instance created earlier is unaffected)
from geopy.geocoders import Nominatim
# register custom user agent (commercial services may also require an API key)
geocoder = Nominatim(user_agent="Information Visualization Tutorial · FH Potsdam")
# concrete_universities
from geopy.extra.rate_limiter import RateLimiter
# add a delay of one second between each geocoding request
geocode = RateLimiter(geocoder.geocode, min_delay_seconds=1)
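# quick test of a single rate-limited lookup, using the first address from the
# DataFrame above (sends one live request to Nominatim)
geocode("Am Neuen Palais 10, 14469 Potsdam")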
# apply geocoding to address column; store responses in location column
concrete_universities['location'] = concrete_universities['address'].apply(geocode)
# create empty columns for coordinates
concrete_universities.loc[:, "lat"] = None
concrete_universities.loc[:, "lon"] = None
# extract lat and lon from the locations via a list comprehension
concrete_universities.loc[:, ['lat', 'lon']] = [ (loc.latitude, loc.longitude) for loc in concrete_universities['location'] ]
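# note: geopy returns None for addresses it cannot resolve; a defensive variant
# of the extraction above (a sketch; all seven addresses here do resolve):
concrete_universities.loc[:, ['lat', 'lon']] = [
    (loc.latitude, loc.longitude) if loc is not None else (None, None)
    for loc in concrete_universities['location']
]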
# create GeoDataFrame, pointing explicitly to lon and lat columns
concrete_universities = gpd.GeoDataFrame(concrete_universities, geometry=gpd.points_from_xy(concrete_universities.lon, concrete_universities.lat))
# remove superfluous columns that are not needed anymore
concrete_universities = concrete_universities.drop(columns=['location', 'lat', 'lon'])
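# the points are WGS84 longitude/latitude; optionally record the CRS explicitly
# (assumes geopandas >= 0.7, which provides set_crs)
concrete_universities = concrete_universities.set_crs(epsg=4326)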
concrete_universities
# visualize
germany_map = gpd.read_file('/work/bundeslaender_simplify20.geojson')
# rewind the polygon vertices (sign -1) so that Vega-Lite fills the shapes correctly
germany_map = germany_map.geometry.apply(orient, args=(-1,))
# basemap
basemap_germany = alt.Chart(germany_map).mark_geoshape(
fill="#F7F3dF", stroke="#C4AE7C"
).properties(width=500, height=500)
# markers
markers = alt.Chart(concrete_universities).mark_circle(opacity=1).encode(
    longitude='geometry.coordinates[0]:Q',
    latitude='geometry.coordinates[1]:Q',
    size="amount:Q",
    color=alt.value('#6FA097'),
    tooltip=['name:N']
)
# basemap_germany
where_are_they = basemap_germany + markers
where_are_they.properties(
title = "Where are my participants?"
).configure_title(
fontSize=15, offset=20, orient='top', anchor='middle')
# import texts
with open("good_emotions.txt") as file1:
    good_emotions = file1.read()
with open("bad_emotions.txt") as file2:
    bad_emotions = file2.read()
# take a look at their written emotions
print("Good emotions: ")
print(" ")
print(good_emotions)
print(" ")
print("Bad emotions: ")
print(" ")
print(bad_emotions)
# NOTE: please run this cell twice (the freshly downloaded NLTK data may not be picked up on the first run)
# tokenize
good_emotions_tokens = nltk.word_tokenize(good_emotions)
bad_emotions_tokens = nltk.word_tokenize(bad_emotions)
# keep only alphabetic tokens, lowercased
good_emotions_onlywords = [word.lower() for word in good_emotions_tokens if word.isalpha()]
bad_emotions_onlywords = [word.lower() for word in bad_emotions_tokens if word.isalpha()]
# join the tokens back into a single readable string
good_emotions_text = ' '.join(good_emotions_onlywords)
bad_emotions_text = ' '.join(bad_emotions_onlywords)
# import
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# wordclouds
wc1 = WordCloud(width=500, height=500, background_color="white", colormap='BrBG').generate(good_emotions_text)
wc2 = WordCloud(width=500, height=500, background_color="black", colormap='BrBG').generate(bad_emotions_text)
plt.rcParams["figure.figsize"] = (20,20)
plt.subplot(1, 2, 1)
plt.imshow(wc1, interpolation="bilinear")
plt.title("Good emotions!", fontsize=40)
plt.axis("off")
plt.subplot(1, 2, 2)
plt.imshow(wc2, interpolation="bilinear")
plt.title("Bad emotions!", fontsize=40)
plt.axis("off")
plt.show()
# bag of words as a dictionary data type
bow = {}
# count the occurrences of each word and save it
for word in bad_emotions_onlywords:
    bow[word] = bad_emotions_onlywords.count(word)
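# the counting loop above is quadratic; collections.Counter builds the same
# bag of words in a single pass (a quick equivalence check):
from collections import Counter
assert dict(Counter(bad_emotions_onlywords)) == bow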
# create a sorted list of word-frequency tuples
words_frequency = sorted(bow.items(), key=lambda x: x[1], reverse=True)
print(words_frequency)
# put the word counts into a dataframe
word_count = pd.DataFrame(words_frequency)
word_count.rename(columns={0: "words", 1: "count"}, inplace=True)
# keep the five most frequent words
words_subset = word_count.iloc[0:5]
words_subset
# let's create a radial (pie-like) chart where both angle and radius encode the count
base = alt.Chart(words_subset).encode(
    theta=alt.Theta("count:Q", stack=True),
    radius=alt.Radius("count", scale=alt.Scale(type="sqrt", zero=True, rangeMin=20)),
    color=alt.Color('words:N', scale=alt.Scale(scheme='brownbluegreen')),
    tooltip='words'
)
c1 = base.mark_arc(
innerRadius=20,
stroke="#fff"
)
c2 = base.mark_text(radiusOffset=10).encode(text="count:Q")
# combine charts for some top-level configurations
c3 = c1 + c2
c3.configure_view(
strokeWidth=0
).properties(title = "Most used words in survey answers").configure_title(fontSize=15, offset=20, orient='top', anchor='start')
# read the survey file
survey_data = pd.read_csv("survey_answers.csv")
survey_data.tail()
# we need another structure for the df: one row per intensity category
order = ["not at all", "little", "medium", "much", "very much"]
general_counts = survey_data['stresslevel'].value_counts()
covid_counts = survey_data['stress_increase_during_covid'].value_counts()
stress_data = {
    'category': order,
    # .get(..., 0) covers categories that nobody picked (e.g. "not at all" for general stress)
    'stress_general': [general_counts.get(c, 0) for c in order],
    'stress_covid': [covid_counts.get(c, 0) for c in order]
}
stress_categories = pd.DataFrame(data=stress_data)
stress_categories
source = stress_categories
# create visualisations
base = alt.Chart(source).encode(
alt.X('category:O', sort=['not at all', 'little', 'medium', 'much', 'very much'], axis=alt.Axis(title="intensity"))
).properties(width = 350, height = 350)
bar1 = base.mark_bar(opacity=1, color='#BEDAD4').encode(
alt.Y('stress_general:Q',
axis=alt.Axis(title='how much stress do you feel in your studies?', titleColor='#6FA097')),
tooltip=[alt.Tooltip('stress_general', title='stress general')],
)
tick1 = base.mark_tick(
color='#AF7500',
thickness=4,
size=40 * 0.9, # controls width of tick.
).encode(
alt.Y('stress_covid',
axis=alt.Axis(title='stress increase during the Covid-19 pandemic?', titleColor='#AF7500')),
tooltip=[alt.Tooltip('stress_covid', title='increase Covid19')],
)
# display visualisation
layer1 = alt.layer(bar1, tick1).resolve_scale(
y = 'independent'
).properties(
title='Stress and Covid-19'
).configure_title(
fontSize=15, offset=20, orient='top', anchor='middle'
)
layer1
# read the excerpt file
stress_england = pd.read_csv("/work/Student Stress England/excerpt1_studentStressEngland.csv", sep=";")
stress_england.head()
# drop unnecessary column
stress_england = stress_england.drop(columns = 'Q18')
# get the first row for the header
new_header = stress_england.iloc[0]
# take the data less the header row
stress_england = stress_england[1:]
# set the header row as the df header
stress_england.columns = new_header
# make column names lowercase
stress_england.columns = stress_england.columns.str.lower()
# from here on we work with the cleaned frame as stress_england_final
stress_england_final = stress_england
# harmonize the answer wording with the five-point scale used elsewhere
stress_england_final["stress"].replace({
    "Somewhat": "Sometimes",
    "To a large extent": "Fairly often",
    "Completely": "Very often",
    "To a small extent": "Almost never"
}, inplace=True)
stress_england_final.tail()
# let's make a safety copy of our dataset that we use from now on
stress_england_final_v2 = stress_england_final.copy()
# map the string values to integer values
stress_england_final_v2["gender"].replace({
    "Female": 1,
    "Male": 2,
    "Prefer not to say": 3
}, inplace=True)
# all symptom columns share the same five-point scale, so we map them in one pass
likert_map = {"Never": 0, "Almost never": 1, "Sometimes": 2, "Fairly often": 3, "Very often": 4}
symptom_cols = ["stress", "low energy", "headaches", "digestion problems",
                "anxiety or tension", "sleep problems", "rapid heartbeat",
                "irritability", "sadness", "concentration problems",
                "illness", "aches and pains", "loneliness"]
stress_england_final_v2[symptom_cols] = stress_england_final_v2[symptom_cols].replace(likert_map)
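# optional sanity check: after the mapping, no string answers should remain
leftover = stress_england_final_v2[symptom_cols].select_dtypes(include='object')
print(leftover.columns.tolist())  # expect an empty list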
# because it doesn't feel right to call the numeric codes "gender", we rename the column to gender_id
stress_england_final_v2 = stress_england_final_v2.rename(
    columns={"gender": "gender_id"})
stress_england_final_v2.head()
import plotly.graph_objects as go
df = stress_england_final_v2
# figure plot
fig = go.Figure(data=
go.Parcoords(
line_color='#008985',
dimensions = list([
dict(range = [0,4],
constraintrange = [2,4],
label = 'Stress Level', values = df['stress']),
dict(range = [0,4],
label = 'Low Energy', values = df['low energy']),
dict(range = [0,4],
label = 'Headaches', values = df['headaches']),
dict(range = [0,4],
label = 'Digestion Problems', values = df['digestion problems']),
dict(range = [0,4],
label = 'Anxiety/Tension', values = df['anxiety or tension']),
dict(range = [0,4],
label = 'Sleep Problems', values = df['sleep problems'])
])
)
)
fig.update_layout(
plot_bgcolor = 'white',
paper_bgcolor = 'white',
title={
'text': "Stress and other Symptoms",
'y':0.95,
'x':0.5,
'xanchor': 'center',
'yanchor': 'top'},
#labels=dict(y="Frequency (0 = Never, 4 = Very often)")
)
fig.show()
# drop all the columns that are unnecessary for this part -> we only want to work with stress and its possible causes
symptoms_stress_england = stress_england_final_v2.drop(columns = ['gender_id', 'ethnicity',
'age',
'fee status',
'degree',
'study year',
'subject',
'low energy',
'headaches',
'digestion problems',
'anxiety or tension',
'sleep problems',
'rapid heartbeat',
'irritability',
'concentration problems',
'sadness',
'illness',
'aches and pains',
'loneliness',
'coping mechanisms',
'do they work?',
'university support?']
)
symptoms_stress_england.head()
# transform all string answers to numbers (necessary for the corr() method);
# the cause columns use the same five-point scale, so we reuse likert_map from above
cause_cols = ["overloaded with university work",
              "too much time at universiy",  # (sic: column name as in the source file)
              "competition with peers",
              "difficulties with supervisor or tutor",
              "unpleasant working environment",
              "criticism about work",
              "lack of time for relaxation",
              "difficult home environment",
              "financial issues",
              "lack of confidence with academic performance",
              "lack of confidence with subject or career choice",
              "conflicts between university work and extracurricular employment"]
symptoms_stress_england[cause_cols] = symptoms_stress_england[cause_cols].replace(likert_map)
# now let's have a quick look at the dataset
symptoms_stress_england.head()
corr = symptoms_stress_england.corr()
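# the answers are ordinal Likert codes, so a rank-based (Spearman) correlation is a
# sensible cross-check next to the default Pearson (an optional sketch, not used below):
corr_spearman = symptoms_stress_england.corr(method='spearman')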
# first we reset the index and name the resulting column dim1
corr = corr.reset_index().rename(columns={'index': 'dim1'})
# turn correlation data into long form
corr = pd.melt(corr, id_vars='dim1', var_name='dim2', value_name='corr')
# add a label column for rounded correlation values
corr['label'] = corr['corr'].map('{:.1f}'.format)
corr
# layered chart, with the base taking in the correlation data corr
# and the basic layout based on the dimensions
base = alt.Chart(corr).encode(
x='dim1:O',
y='dim2:O'
).properties(width=400, height=400)
# a textual layer displaying rounded correlation values
text = base.mark_text().encode( text='label' )
# heatmap of the correlation values
plot = base.mark_rect().encode(
color= alt.Color('corr:Q', scale=alt.Scale(scheme='brownbluegreen'))
)
# both layers are combined
plot + text
symptoms_stress_england['stress'].value_counts()
# extract all the people that report stress very often
stress_level_4 = symptoms_stress_england.loc[symptoms_stress_england['stress'] == 4]
# take a sample of seven out of it (pass random_state=... for a reproducible sample)
stress_level_4_sample = stress_level_4.sample(n=7)
# extract all the people that report stress fairly often
stress_level_3 = symptoms_stress_england.loc[symptoms_stress_england['stress'] == 3]
stress_level_3_sample = stress_level_3.sample(n=7)
# extract all the people that report stress sometimes
stress_level_2 = symptoms_stress_england.loc[symptoms_stress_england['stress'] == 2]
stress_level_2_sample = stress_level_2.sample(n=7)
# extract all the people that report stress almost never
stress_level_1 = symptoms_stress_england.loc[symptoms_stress_england['stress'] == 1]
# no sampling needed here: this group is small enough to use in full
# combine them
combined_stress = pd.concat([stress_level_4_sample, stress_level_3_sample, stress_level_2_sample, stress_level_1])
# reset the index and rename it to student_number
combined_stress = combined_stress.reset_index().rename(columns={'index': 'student_number'})
# restrict the dataset to the columns of interest
combined_stress = combined_stress[['student_number', 'difficulties with supervisor or tutor', 'stress', 'lack of time for relaxation', 'overloaded with university work', 'lack of confidence with academic performance']]
combined_stress
# to coordinate hover highlights we create a selection
selection = alt.selection_single(on='mouseover', fields=['student_number'])
# the definitions of the base are used by the three sub-charts
base = alt.Chart(combined_stress).mark_bar().encode(
# adjust opacity based on hover selection
opacity=alt.condition(selection, alt.value(1), alt.value(.5)),
x = alt.X("student_number:O", sort="-y", axis=None),
color=alt.Color('stress', legend=alt.Legend(title="Stresslevel by color"), scale=alt.Scale(scheme='brownbluegreen')),
tooltip=['student_number', 'lack of time for relaxation', 'overloaded with university work','lack of confidence with academic performance','difficulties with supervisor or tutor']
).properties(
width=600, height=100
).add_selection(selection)
# create a chart for each dimension
multiple1 = base.encode(y = alt.Y("lack of time for relaxation"))
multiple2 = base.encode(y = alt.Y("overloaded with university work", title="overloaded with uni work"))
multiple3 = base.encode(y = alt.Y("lack of confidence with academic performance", title="not confident with performance"))
multiple4 = base.encode(y = alt.Y("difficulties with supervisor or tutor", title="difficulties with tutor"))
# combine them with ampersands
multiples = multiple1 & multiple2 & multiple3 & multiple4
multiples.properties(
title='Stress and possible causes'
).configure_title(
fontSize=15, offset=20, orient='top', anchor='middle'
)
# In the first step, I limit myself to one question that was asked in all three years
# load in dataset 1
Cov_Set1 = pd.read_csv("/work/Covid-19/Cov19_2020_excerpt.csv", sep=";")
Cov_Set1.head()
# drop the stray unnamed column and the question column
# (the question is the same every year: how has your wellbeing and mental health changed since the last term?)
Cov_Set1 = Cov_Set1.drop(columns=['Unnamed: 0', 'Unnamed: 7'])
# the other datasets are not separated into different student categories,
# so we drop those rows here too -> this gives us a short, combinable dataset
Cov_Set1 = Cov_Set1.drop(Cov_Set1.index[[1, 2, 3]])
# add a datetime column; we use the survey's end date as the date index
endDate1 = pd.to_datetime('25.11.2020', dayfirst=True)
Cov_Set1['datetime'] = endDate1
Cov_Set1.head()
# now let's set the datetime as index
Cov_Set1.set_index('datetime', inplace=True)
Cov_Set1.head()
# load in dataset 2
Cov_Set2 = pd.read_csv("/work/Covid-19/Cov19_2021_excerpt.csv", sep=";")
Cov_Set2.head()
# this file has no Samplesize column, but we can read the value from the Excel sheet and add it
Cov_Set2['Samplesize'] = 1110
# drop the question column
Cov_Set2 = Cov_Set2.drop(columns=['Unnamed: 0'])
# add a datetime column
endDate2 = pd.to_datetime('02.06.2021', dayfirst=True)
Cov_Set2['datetime'] = endDate2
# now let's set the datetime as index
Cov_Set2.set_index('datetime', inplace=True)
Cov_Set2.head()
# load in dataset 3
Cov_Set3 = pd.read_csv("/work/Covid-19/Cov19_2022_excerpt.csv", sep=";")
Cov_Set3.head()
# drop the stray empty rows
Cov_Set3.dropna(inplace=True)
# drop the question column
Cov_Set3 = Cov_Set3.drop(columns=['Unnamed: 0'])
# add a datetime column
endDate3 = pd.to_datetime('07.03.2022', dayfirst=True)
Cov_Set3['datetime'] = endDate3
# now let's set the datetime as index
Cov_Set3.set_index('datetime', inplace=True)
Cov_Set3.head()
# datasets 4-12 (the individual 2021 survey waves) all need the same operations:
# read, drop the unnamed columns, drop empty rows, and index by the survey's end date.
# instead of nine copy-pasted cells, we process them in one loop
later_waves = {}
for date, drop_cols in [
    ('18.01.2021', ['Unnamed: 8', 'Unnamed: 0']),
    ('01.03.2021', ['Unnamed: 0']),
    ('22.03.2021', ['Unnamed: 0']),
    ('22.04.2021', ['Unnamed: 0']),
    ('12.05.2021', ['Unnamed: 0']),
    ('04.10.2021', ['Unnamed: 0']),
    ('01.11.2021', ['Unnamed: 0']),
    ('15.11.2021', ['Unnamed: 0', 'Unnamed: 8', 'Unnamed: 9']),
    ('29.11.2021', ['Unnamed: 0']),
]:
    wave = pd.read_csv(f"/work/Covid-19/Cov19_{date}_excerpt.csv", sep=";")
    wave = wave.drop(columns=drop_cols)
    wave.dropna(inplace=True)
    wave['datetime'] = pd.to_datetime(date, dayfirst=True)
    wave.set_index('datetime', inplace=True)
    later_waves[date] = wave
# combine all sets: the three yearly excerpts plus the nine later waves
Cov_Set_combined = pd.concat([Cov_Set1, Cov_Set2, Cov_Set3, *later_waves.values()])
# we don't need the 'Prefer not to say' columns, so we delete them
Cov_Set_combined = Cov_Set_combined.drop(Cov_Set_combined.columns[[5, 7]], axis=1)
Cov_Set_combined
# the Samplesize values would distort the sums, so we drop that column
Cov_Set_withoutSamplesize = Cov_Set_combined.drop(columns=['Samplesize'])
# aggregate the individual survey waves into monthly sums
months = Cov_Set_withoutSamplesize.resample("M").sum()
# initialize the selection (an interval brush along the x-axis)
brush = alt.selection(type='interval', encodings=['x'])
# overview chart, smoothed over the months, showing value changes over time;
# the selection is set by brushing directly on this chart
upper = alt.Chart(months.reset_index().melt("datetime")).mark_area(interpolate="basis").encode(
    x=alt.X('datetime:T', axis=None),
    y=alt.Y('value:Q', axis=None),
    color=alt.Color('variable', scale=alt.Scale(scheme='brownbluegreen'), title='change of well-being',
                    legend=alt.Legend(orient="bottom")),
).properties(width=800, height=100).add_selection(brush)
# detail chart whose x-range follows the brush selection
lines = alt.Chart(Cov_Set_withoutSamplesize.reset_index().melt("datetime")).mark_line(strokeWidth=2).encode(
    x=alt.X('datetime:T', scale=alt.Scale(domain=brush), title='datetime'),
    y=alt.Y('value', title='amount (%)'),
    color='variable',
).properties(width=800, height=300)
data_lines = pd.DataFrame({
'value': Cov_Set_withoutSamplesize.index.values
})
# vertical lines mark the exact data collection days
vertlines = alt.Chart(data_lines).mark_rule(
strokeWidth= 0.7,
color = 'black'
).encode(x=alt.X('value', scale=alt.Scale(domain=brush)))
# display the overview and the detail chart together in one frame (detail below overview)
display = upper & (lines + vertlines)
display.properties(
title='Change of well-being and mental health'
).configure_title(fontSize=15, offset=12, orient='top', anchor='middle')