# import tools
# general tools
import altair as alt
import pandas as pd
import requests
# geovisualisation tools
import geopandas as gpd
from OSMPythonTools.overpass import overpassQueryBuilder
from OSMPythonTools.nominatim import Nominatim
nominatim = Nominatim()
from OSMPythonTools.overpass import Overpass
overpass = Overpass()
from shapely.ops import orient
# text processing tools
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('omw-1.4')
nltk.download('words')
#!pip install SpaCy
#!python -m spacy download en_core_web_sm
#!pip install wordcloud
#!pip install ipywidgets
from nltk.stem import PorterStemmer, WordNetLemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
from nltk.corpus import wordnet # for robust lemmatization
# find the universities of my survey participants
# create a DataFrame with all the university names
concrete_universities = pd.DataFrame({
"name": ["Universität Potsdam", "Fachhochschule Potsdam", "Freie Universität Berlin", "Universität Stuttgart", "Julius-Maximilians-Universität Würzburg", "Universität Regensburg", "Hochschule Heilbronn Campus Sontheim"],
"city": ["Potsdam", "Potsdam", "Berlin", "Stuttgart", "Würzburg", "Regensburg", "Heilbronn"],
"address": ["Am Neuen Palais 10, 14469 Potsdam", "Kiepenheuerallee 5, 14469 Potsdam", "Kaiserswerther Str. 16, 14195 Berlin", "Keplerstraße 7, 70174 Stuttgart","Sanderring 2, 97070 Würzburg", "Universitätsstraße 31, 93053 Regensburg", "Max-Planck-Straße 39, 74081 Heilbronn"],
"amount": [3, 4, 1, 1, 2, 3, 1]
}
)
# import Nominatim as the geopy geocoder
# (this shadows the OSMPythonTools Nominatim class imported above; the nominatim instance created earlier is unaffected)
from geopy.geocoders import Nominatim
# register custom user agent (commercial services may also require an API key)
geocoder = Nominatim(user_agent="Information Visualization Tutorial · FH Potsdam")
# concrete_universities
from geopy.extra.rate_limiter import RateLimiter
# add a delay of one second between each geocoding request
geocode = RateLimiter(geocoder.geocode, min_delay_seconds=1)
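# quick test of a single rate-limited lookup, using the first address from the
# DataFrame above (sends one live request to Nominatim)
geocode("Am Neuen Palais 10, 14469 Potsdam")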
# apply geocoding to address column; store responses in location column
concrete_universities['location'] = concrete_universities['address'].apply(geocode)
# create empty columns for coordinates
concrete_universities.loc[:, "lat"] = None
concrete_universities.loc[:, "lon"] = None
# extract lat and lon from the locations via a list comprehension
concrete_universities.loc[:, ['lat', 'lon']] = [ (loc.latitude, loc.longitude) for loc in concrete_universities['location'] ]
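# note: geopy returns None for addresses it cannot resolve; a defensive variant
# of the extraction above (a sketch; all seven addresses here do resolve):
concrete_universities.loc[:, ['lat', 'lon']] = [
    (loc.latitude, loc.longitude) if loc is not None else (None, None)
    for loc in concrete_universities['location']
]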
# create GeoDataFrame, pointing explicitly to lon and lat columns
concrete_universities = gpd.GeoDataFrame(concrete_universities, geometry=gpd.points_from_xy(concrete_universities.lon, concrete_universities.lat))
# remove superfluous columns that are not needed anymore
concrete_universities = concrete_universities.drop(columns=['location', 'lat', 'lon'])
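# the points are WGS84 longitude/latitude; optionally record the CRS explicitly
# (assumes geopandas >= 0.7, which provides set_crs)
concrete_universities = concrete_universities.set_crs(epsg=4326)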
concrete_universities
# visualize
germany_map = gpd.read_file('/work/bundeslaender_simplify20.geojson')
# rewind the polygon vertices (sign -1) so that Vega-Lite fills the shapes correctly
germany_map = germany_map.geometry.apply(orient, args=(-1,))
# basemap
basemap_germany = alt.Chart(germany_map).mark_geoshape(
fill="#F7F3dF", stroke="#C4AE7C"
).properties(width=500, height=500)
# markers
markers = alt.Chart(concrete_universities).mark_circle(opacity=1).encode(
    longitude='geometry.coordinates[0]:Q',
    latitude='geometry.coordinates[1]:Q',
    size="amount:Q",
    color=alt.value('#6FA097'),
    tooltip=['name:N']
)
# basemap_germany
where_are_they = basemap_germany + markers
where_are_they.properties(
title = "Where are my participants?"
).configure_title(
fontSize=15, offset=20, orient='top', anchor='middle')
# import texts
with open("good_emotions.txt") as file1:
    good_emotions = file1.read()
with open("bad_emotions.txt") as file2:
    bad_emotions = file2.read()
# take a look at their written emotions
print("Good emotions: ")
print(" ")
print(good_emotions)
print(" ")
print("Bad emotions: ")
print(" ")
print(bad_emotions)
# NOTE: please run this cell twice (the freshly downloaded NLTK data may not be picked up on the first run)
# tokenize
good_emotions_tokens = nltk.word_tokenize(good_emotions)
bad_emotions_tokens = nltk.word_tokenize(bad_emotions)
# keep only alphabetic tokens, lowercased
good_emotions_onlywords = [word.lower() for word in good_emotions_tokens if word.isalpha()]
bad_emotions_onlywords = [word.lower() for word in bad_emotions_tokens if word.isalpha()]
# join the tokens back into a single readable string
good_emotions_text = ' '.join(good_emotions_onlywords)
bad_emotions_text = ' '.join(bad_emotions_onlywords)
# import
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# wordclouds
wc1 = WordCloud(width=500, height=500, background_color="white", colormap='BrBG').generate(good_emotions_text)
wc2 = WordCloud(width=500, height=500, background_color="black", colormap='BrBG').generate(bad_emotions_text)
plt.rcParams["figure.figsize"] = (20,20)
plt.subplot(1, 2, 1)
plt.imshow(wc1, interpolation="bilinear")
plt.title("Good emotions!", fontsize=40)
plt.axis("off")
plt.subplot(1, 2, 2)
plt.imshow(wc2, interpolation="bilinear")
plt.title("Bad emotions!", fontsize=40)
plt.axis("off")
plt.show()
# bag of words as a dictionary data type
bow = {}
# count the occurrences of each word and save it
for word in bad_emotions_onlywords:
    bow[word] = bad_emotions_onlywords.count(word)
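# the counting loop above is quadratic; collections.Counter builds the same
# bag of words in a single pass (a quick equivalence check):
from collections import Counter
assert dict(Counter(bad_emotions_onlywords)) == bow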
# create a sorted list of word-frequency tuples
words_frequency = sorted(bow.items(), key=lambda x: x[1], reverse=True)
print(words_frequency)
# put the word counts into a dataframe
word_count = pd.DataFrame(words_frequency)
word_count.rename(columns={0: "words", 1: "count"}, inplace=True)
# keep the five most frequent words
words_subset = word_count.iloc[0:5]
words_subset
# let's create a radial (pie-like) chart where both angle and radius encode the count
base = alt.Chart(words_subset).encode(
    theta=alt.Theta("count:Q", stack=True),
    radius=alt.Radius("count", scale=alt.Scale(type="sqrt", zero=True, rangeMin=20)),
    color=alt.Color('words:N', scale=alt.Scale(scheme='brownbluegreen')),
    tooltip='words'
)
c1 = base.mark_arc(
innerRadius=20,
stroke="#fff"
)
c2 = base.mark_text(radiusOffset=10).encode(text="count:Q")
# combine charts for some top-level configurations
c3 = c1 + c2
c3.configure_view(
strokeWidth=0
).properties(title = "Most used words in survey answers").configure_title(fontSize=15, offset=20, orient='top', anchor='start')
# read the survey file
survey_data = pd.read_csv("survey_answers.csv")
survey_data.tail()
# we need another structure for the df: one row per intensity category
order = ["not at all", "little", "medium", "much", "very much"]
general_counts = survey_data['stresslevel'].value_counts()
covid_counts = survey_data['stress_increase_during_covid'].value_counts()
stress_data = {
    'category': order,
    # .get(..., 0) covers categories that nobody picked (e.g. "not at all" for general stress)
    'stress_general': [general_counts.get(c, 0) for c in order],
    'stress_covid': [covid_counts.get(c, 0) for c in order]
}
stress_categories = pd.DataFrame(data=stress_data)
stress_categories
source = stress_categories
# create visualisations
base = alt.Chart(source).encode(
alt.X('category:O', sort=['not at all', 'little', 'medium', 'much', 'very much'], axis=alt.Axis(title="intensity"))
).properties(width = 350, height = 350)
bar1 = base.mark_bar(opacity=1, color='#BEDAD4').encode(
alt.Y('stress_general:Q',
axis=alt.Axis(title='how much stress do you feel in your studies?', titleColor='#6FA097')),
tooltip=[alt.Tooltip('stress_general', title='stress general')],
)
tick1 = base.mark_tick(
color='#AF7500',
thickness=4,
size=40 * 0.9, # controls width of tick.
).encode(
alt.Y('stress_covid',
axis=alt.Axis(title='stress increase during the Covid-19 pandemic?', titleColor='#AF7500')),
tooltip=[alt.Tooltip('stress_covid', title='increase Covid19')],
)
# display visualisation
layer1 = alt.layer(bar1, tick1).resolve_scale(
y = 'independent'
).properties(
title='Stress and Covid-19'
).configure_title(
fontSize=15, offset=20, orient='top', anchor='middle'
)
layer1
# read the excerpt file
stress_england = pd.read_csv("/work/Student Stress England/excerpt1_studentStressEngland.csv", sep=";")
stress_england.head()
# drop unnecessary column
stress_england = stress_england.drop(columns = 'Q18')
# get the first row for the header
new_header = stress_england.iloc[0]
# take the data less the header row
stress_england = stress_england[1:]
# set the header row as the df header
stress_england.columns = new_header
# make column names lowercase
stress_england.columns = stress_england.columns.str.lower()
# from here on we work with the cleaned frame as stress_england_final
stress_england_final = stress_england
# harmonize the answer wording with the five-point scale used elsewhere
stress_england_final["stress"].replace({
    "Somewhat": "Sometimes",
    "To a large extent": "Fairly often",
    "Completely": "Very often",
    "To a small extent": "Almost never"
}, inplace=True)
stress_england_final.tail()
# let's make a safety copy of our dataset that we use from now on
stress_england_final_v2 = stress_england_final.copy()
# map the string values to integer values
stress_england_final_v2["gender"].replace({
    "Female": 1,
    "Male": 2,
    "Prefer not to say": 3
}, inplace=True)
# all symptom columns share the same five-point scale, so we map them in one pass
likert_map = {"Never": 0, "Almost never": 1, "Sometimes": 2, "Fairly often": 3, "Very often": 4}
symptom_cols = ["stress", "low energy", "headaches", "digestion problems",
                "anxiety or tension", "sleep problems", "rapid heartbeat",
                "irritability", "sadness", "concentration problems",
                "illness", "aches and pains", "loneliness"]
stress_england_final_v2[symptom_cols] = stress_england_final_v2[symptom_cols].replace(likert_map)
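# optional sanity check: after the mapping, no string answers should remain
leftover = stress_england_final_v2[symptom_cols].select_dtypes(include='object')
print(leftover.columns.tolist())  # expect an empty list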
# because it doesn't feel right to call the numeric codes "gender", we rename the column to gender_id
stress_england_final_v2 = stress_england_final_v2.rename(
    columns={"gender": "gender_id"})
stress_england_final_v2.head()
import plotly.graph_objects as go
df = stress_england_final_v2
# figure plot
fig = go.Figure(data=
go.Parcoords(
line_color='#008985',
dimensions = list([
dict(range = [0,4],
constraintrange = [2,4],
label = 'Stress Level', values = df['stress']),
dict(range = [0,4],
label = 'Low Energy', values = df['low energy']),
dict(range = [0,4],
label = 'Headaches', values = df['headaches']),
dict(range = [0,4],
label = 'Digestion Problems', values = df['digestion problems']),
dict(range = [0,4],
label = 'Anxiety/Tension', values = df['anxiety or tension']),
dict(range = [0,4],
label = 'Sleep Problems', values = df['sleep problems'])
])
)
)
fig.update_layout(
plot_bgcolor = 'white',
paper_bgcolor = 'white',
title={
'text': "Stress and other Symptoms",
'y':0.95,
'x':0.5,
'xanchor': 'center',
'yanchor': 'top'},
#labels=dict(y="Frequency (0 = Never, 4 = Very often)")
)
fig.show()
# drop all the columns that are unnecessary for this part -> we only want to work with stress and its possible causes
symptoms_stress_england = stress_england_final_v2.drop(columns = ['gender_id', 'ethnicity',
'age',
'fee status',
'degree',
'study year',
'subject',
'low energy',
'headaches',
'digestion problems',
'anxiety or tension',
'sleep problems',
'rapid heartbeat',
'irritability',
'concentration problems',
'sadness',
'illness',
'aches and pains',
'loneliness',
'coping mechanisms',
'do they work?',
'university support?']
)
symptoms_stress_england.head()
# transform all string answers to numbers (necessary for the corr() method);
# the cause columns use the same five-point scale, so we reuse likert_map from above
cause_cols = ["overloaded with university work",
              "too much time at universiy",  # (sic: column name as in the source file)
              "competition with peers",
              "difficulties with supervisor or tutor",
              "unpleasant working environment",
              "criticism about work",
              "lack of time for relaxation",
              "difficult home environment",
              "financial issues",
              "lack of confidence with academic performance",
              "lack of confidence with subject or career choice",
              "conflicts between university work and extracurricular employment"]
symptoms_stress_england[cause_cols] = symptoms_stress_england[cause_cols].replace(likert_map)
# now let's have a quick look at the dataset
symptoms_stress_england.head()
corr = symptoms_stress_england.corr()
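# the answers are ordinal Likert codes, so a rank-based (Spearman) correlation is a
# sensible cross-check next to the default Pearson (an optional sketch, not used below):
corr_spearman = symptoms_stress_england.corr(method='spearman')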
# first we reset the index and name the resulting column dim1
corr = corr.reset_index().rename(columns={'index': 'dim1'})
# turn correlation data into long form
corr = pd.melt(corr, id_vars='dim1', var_name='dim2', value_name='corr')
# add a label column for rounded correlation values
corr['label'] = corr['corr'].map('{:.1f}'.format)
corr
# layered chart, with the base taking in the correlation data corr
# and the basic layout based on the dimensions
base = alt.Chart(corr).encode(
x='dim1:O',
y='dim2:O'
).properties(width=400, height=400)
# a textual layer displaying rounded correlation values
text = base.mark_text().encode( text='label' )
# heatmap of the correlation values
plot = base.mark_rect().encode(
color= alt.Color('corr:Q', scale=alt.Scale(scheme='brownbluegreen'))
)
# both layers are combined
plot + text
symptoms_stress_england['stress'].value_counts()
# extract all the people that report stress very often
stress_level_4 = symptoms_stress_england.loc[symptoms_stress_england['stress'] == 4]
# take a sample of seven out of it (pass random_state=... for a reproducible sample)
stress_level_4_sample = stress_level_4.sample(n=7)
# extract all the people that report stress fairly often
stress_level_3 = symptoms_stress_england.loc[symptoms_stress_england['stress'] == 3]
stress_level_3_sample = stress_level_3.sample(n=7)
# extract all the people that report stress sometimes
stress_level_2 = symptoms_stress_england.loc[symptoms_stress_england['stress'] == 2]
stress_level_2_sample = stress_level_2.sample(n=7)
# extract all the people that report stress almost never
stress_level_1 = symptoms_stress_england.loc[symptoms_stress_england['stress'] == 1]
# no sampling needed here: this group is small enough to use in full
# combine them
combined_stress = pd.concat([stress_level_4_sample, stress_level_3_sample, stress_level_2_sample, stress_level_1])
# reset the index and rename it to student_number
combined_stress = combined_stress.reset_index().rename(columns={'index': 'student_number'})
# restrict the dataset to the columns of interest
combined_stress = combined_stress[['student_number', 'difficulties with supervisor or tutor', 'stress', 'lack of time for relaxation', 'overloaded with university work', 'lack of confidence with academic performance']]
combined_stress
# to coordinate hover highlights we create a selection
selection = alt.selection_single(on='mouseover', fields=['student_number'])
# the definitions of the base are used by the three sub-charts
base = alt.Chart(combined_stress).mark_bar().encode(
# adjust opacity based on hover selection
opacity=alt.condition(selection, alt.value(1), alt.value(.5)),
x = alt.X("student_number:O", sort="-y", axis=None),
color=alt.Color('stress', legend=alt.Legend(title="Stresslevel by color"), scale=alt.Scale(scheme='brownbluegreen')),
tooltip=['student_number', 'lack of time for relaxation', 'overloaded with university work','lack of confidence with academic performance','difficulties with supervisor or tutor']
).properties(
width=600, height=100
).add_selection(selection)
# create a chart for each dimension
multiple1 = base.encode(y = alt.Y("lack of time for relaxation"))
multiple2 = base.encode(y = alt.Y("overloaded with university work", title="overloaded with uni work"))
multiple3 = base.encode(y = alt.Y("lack of confidence with academic performance", title="not confident with performance"))
multiple4 = base.encode(y = alt.Y("difficulties with supervisor or tutor", title="difficulties with tutor"))
# combine them with ampersands
multiples = multiple1 & multiple2 & multiple3 & multiple4
multiples.properties(
title='Stress and possible causes'
).configure_title(
fontSize=15, offset=20, orient='top', anchor='middle'
)
# In the first step, I limit myself to one question that was asked in all three years
# load in dataset 1
Cov_Set1 = pd.read_csv("/work/Covid-19/Cov19_2020_excerpt.csv", sep=";")
Cov_Set1.head()
# drop the stray unnamed column and the question column
# (the question is the same every year: how has your wellbeing and mental health changed since the last term?)
Cov_Set1 = Cov_Set1.drop(columns=['Unnamed: 0', 'Unnamed: 7'])
# the other datasets are not separated into different student categories,
# so we drop those rows here too -> this gives us a short, combinable dataset
Cov_Set1 = Cov_Set1.drop(Cov_Set1.index[[1, 2, 3]])
# add a datetime column; we use the survey's end date as the date index
endDate1 = pd.to_datetime('25.11.2020', dayfirst=True)
Cov_Set1['datetime'] = endDate1
Cov_Set1.head()
# now let's set the datetime as index
Cov_Set1.set_index('datetime', inplace=True)
Cov_Set1.head()
# load in dataset 2
Cov_Set2 = pd.read_csv("/work/Covid-19/Cov19_2021_excerpt.csv", sep=";")
Cov_Set2.head()
# this file has no Samplesize column, but we can read the value from the Excel sheet and add it
Cov_Set2['Samplesize'] = 1110
# drop the question column
Cov_Set2 = Cov_Set2.drop(columns=['Unnamed: 0'])
# add a datetime column
endDate2 = pd.to_datetime('02.06.2021', dayfirst=True)
Cov_Set2['datetime'] = endDate2
# now let's set the datetime as index
Cov_Set2.set_index('datetime', inplace=True)
Cov_Set2.head()
# load in dataset 3
Cov_Set3 = pd.read_csv("/work/Covid-19/Cov19_2022_excerpt.csv", sep=";")
Cov_Set3.head()
# drop the stray empty rows
Cov_Set3.dropna(inplace=True)
# drop the question column
Cov_Set3 = Cov_Set3.drop(columns=['Unnamed: 0'])
# add a datetime column
endDate3 = pd.to_datetime('07.03.2022', dayfirst=True)
Cov_Set3['datetime'] = endDate3
# now let's set the datetime as index
Cov_Set3.set_index('datetime', inplace=True)
Cov_Set3.head()
# datasets 4-12 (the individual 2021 survey waves) all need the same operations:
# read, drop the unnamed columns, drop empty rows, and index by the survey's end date.
# instead of nine copy-pasted cells, we process them in one loop
later_waves = {}
for date, drop_cols in [
    ('18.01.2021', ['Unnamed: 8', 'Unnamed: 0']),
    ('01.03.2021', ['Unnamed: 0']),
    ('22.03.2021', ['Unnamed: 0']),
    ('22.04.2021', ['Unnamed: 0']),
    ('12.05.2021', ['Unnamed: 0']),
    ('04.10.2021', ['Unnamed: 0']),
    ('01.11.2021', ['Unnamed: 0']),
    ('15.11.2021', ['Unnamed: 0', 'Unnamed: 8', 'Unnamed: 9']),
    ('29.11.2021', ['Unnamed: 0']),
]:
    wave = pd.read_csv(f"/work/Covid-19/Cov19_{date}_excerpt.csv", sep=";")
    wave = wave.drop(columns=drop_cols)
    wave.dropna(inplace=True)
    wave['datetime'] = pd.to_datetime(date, dayfirst=True)
    wave.set_index('datetime', inplace=True)
    later_waves[date] = wave
# combine all sets: the three yearly excerpts plus the nine later waves
Cov_Set_combined = pd.concat([Cov_Set1, Cov_Set2, Cov_Set3, *later_waves.values()])
# we don't need the 'Prefer not to say' columns, so we delete them
Cov_Set_combined = Cov_Set_combined.drop(Cov_Set_combined.columns[[5, 7]], axis=1)
Cov_Set_combined
# the Samplesize values would distort the sums, so we drop that column
Cov_Set_withoutSamplesize = Cov_Set_combined.drop(columns=['Samplesize'])
# aggregate the individual survey waves into monthly sums
months = Cov_Set_withoutSamplesize.resample("M").sum()
# initialize the selection (an interval brush along the x-axis)
brush = alt.selection(type='interval', encodings=['x'])
# overview chart, smoothed over the months, showing value changes over time;
# the selection is set by brushing directly on this chart
upper = alt.Chart(months.reset_index().melt("datetime")).mark_area(interpolate="basis").encode(
    x=alt.X('datetime:T', axis=None),
    y=alt.Y('value:Q', axis=None),
    color=alt.Color('variable', scale=alt.Scale(scheme='brownbluegreen'), title='change of well-being',
                    legend=alt.Legend(orient="bottom")),
).properties(width=800, height=100).add_selection(brush)
# detail chart whose x-range follows the brush selection
lines = alt.Chart(Cov_Set_withoutSamplesize.reset_index().melt("datetime")).mark_line(strokeWidth=2).encode(
    x=alt.X('datetime:T', scale=alt.Scale(domain=brush), title='datetime'),
    y=alt.Y('value', title='amount (%)'),
    color='variable',
).properties(width=800, height=300)
data_lines = pd.DataFrame({
'value': Cov_Set_withoutSamplesize.index.values
})
# vertical lines mark the exact data collection days
vertlines = alt.Chart(data_lines).mark_rule(
strokeWidth= 0.7,
color = 'black'
).encode(x=alt.X('value', scale=alt.Scale(domain=brush)))
# display the overview and the detail chart together in one frame (detail below overview)
display = upper & (lines + vertlines)
display.properties(
title='Change of well-being and mental health'
).configure_title(fontSize=15, offset=12, orient='top', anchor='middle')