!pip install seaborn==0.11.0
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter(action='ignore')
def _load_survey(csv_path, year):
    """Read one survey export and prepare it for the analysis below.

    Steps, identical for every survey year:
      * drop the row labelled 0 (the first row read from the file),
      * convert 'Time from Start to Finish (seconds)' to ``datetime.time``
        values and use them as the index, named 'time',
      * add a string column 'Year' holding ``year``.

    Parameters:
        csv_path: path of the CSV file to read.
        year: value stored in the 'Year' column (kept as given).

    Returns:
        The prepared DataFrame.
    """
    frame = pd.read_csv(csv_path)
    frame = frame.drop(index=0)
    seconds = frame.pop('Time from Start to Finish (seconds)').astype(int)
    # NOTE(review): .dt.time keeps only the time-of-day component, so a
    # duration of 24 hours or more wraps around - confirm this is intended.
    elapsed = pd.to_datetime(seconds, unit='s').dt.time.rename('time')
    frame = frame.set_index(elapsed)
    frame["Year"] = year
    return frame


Kaggle = _load_survey("../input/kaggle-survey-2020/kaggle_survey_2020_responses.csv", "2020")
Kaggle.head(3)
Kaggle19 = _load_survey("../input/kaggle-survey-2019/multiple_choice_responses.csv", "2019")
Kaggle19.head(3)
Kaggle18 = _load_survey("../input/kaggle-survey-2018/multipleChoiceResponses.csv", "2018")
Kaggle18.head(3)
# Split the 2020 respondents on their Q4 answer: the four degree answers go to
# Kaggle_WDegree, every other row (including rows with no Q4 answer) goes to
# Kaggle_NDegree.
_holds_degree = Kaggle["Q4"].isin([
    "Doctoral degree",
    "Master’s degree",
    "Bachelor’s degree",
    "Professional degree",
])
Kaggle_NDegree = Kaggle[~_holds_degree]
Kaggle_WDegree = Kaggle[_holds_degree]
Kaggle_WDegree.head(3);
Kaggle_NDegree.head(3)
# D is another name for Kaggle, not a copy: recoding Q4 here recodes Kaggle.Q4
# as well.
D = Kaggle
# Collapse Q4 into two labels with one column assignment instead of chained
# indexing (D.Q4[mask] = ...), which pandas does not guarantee to write back
# to the frame.  'With Degree' is part of the match list so that running this
# cell a second time leaves the column unchanged.
D["Q4"] = np.where(
    D["Q4"].isin([
        "Doctoral degree",
        "Master’s degree",
        "Bachelor’s degree",
        "Professional degree",
        "With Degree",
    ]),
    'With Degree',
    'Without Degree',
)
# Histogram of Q1 (age group), one colour per Q4 label.
fig, ax = plt.subplots(1, 1, figsize=(15, 5))
ax1 = sns.histplot(D.sort_values(by="Q1"), x="Q1", kde=True, hue='Q4', palette="viridis")
ax1.set_title('Age Group with and without Degree', fontsize=16, fontweight='bold')
# Q1 is plotted on the x axis and the bar height is a respondent count, so
# 'Age Group' belongs on x (it was previously attached to the y axis).
ax1.set(xlabel='Age Group', ylabel='Count');
# Respondent counts per Q1 value (rows) and Q4 label (columns).
catD = D.groupby("Q1")["Q4"].value_counts().unstack()
# Each percentage column is the row's share of its own Q4 column total.
# Series.sum() skips missing cells; the builtin sum() would return NaN for the
# total (and therefore for every percentage) if a single Q1 value had no
# respondents in one of the two groups.
for _label in ("With Degree", "Without Degree"):
    catD[_label + " %"] = ((catD[_label] / catD[_label].sum()) * 100).round(1)
catD.sort_index(inplace=True)
catD
import matplotlib.ticker as mtick

# Long format: one row per (Q1, percentage column) pair, value column 'count'.
_pct_columns = ["With Degree %", "Without Degree %"]
catNew = catD[_pct_columns].stack()
catNew = pd.DataFrame(catNew).reset_index()
catNew = catNew.rename(columns={0: "count"})

fig, ax = plt.subplots(1, 1, figsize=(15, 5))
ax1 = sns.barplot(
    data=catNew.sort_values(by="Q1"),
    x='Q1',
    y='count',
    hue='Q4',
    palette="viridis",
)
ax1.set_title('Age Group with and without Degree Normalized', fontsize=16, fontweight='bold')
ax1.set(xlabel='Age Group', ylabel="")
ax1.legend(title="");
# Show the y ticks as whole-number percentages.
ax.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.0f%%'));
# Bar chart of Q2 answers among respondents without a degree, largest first.
f, axes = plt.subplots(1, 1, figsize=(10, 5))
sns.set_style("whitegrid", {'axes.grid': False})
sex = Kaggle_NDegree.Q2.value_counts().sort_values(ascending=False).to_frame()
ax1 = sns.barplot(data=sex, x=sex.index, y='Q2', palette="coolwarm")
ax1.set_title('Different type of Sex in Survey', fontsize=21, fontweight='bold')
ax1.set_xlabel('Sex')
ax1.set_ylabel('')
ax1.set_xticklabels(ax1.get_xticklabels(), rotation=90);
ax1.set_yticks([])
# The y ticks are hidden, so print each bar's value just above the bar.
for bar in ax1.patches:
    bar_top = bar.get_height()
    bar_middle = bar.get_x() + bar.get_width() / 2.
    ax1.annotate(
        f"{bar_top:1.0f}",
        (bar_middle, bar_top),
        ha='center',
        va='center',
        xytext=(0, 9),
        textcoords='offset points',
    )
for side in ('top', 'left', 'right', 'bottom'):
    ax1.spines[side].set_visible(False)
# Back-to-back horizontal bars per Q1 value: 'Man' counts to the right,
# 'Woman' counts negated so they extend to the left.
fig, ax = plt.subplots(1, 1, figsize=(15, 10))
_q2_by_q1 = Kaggle_NDegree.groupby(['Q1'])['Q2'].value_counts()
age_sex = _q2_by_q1.unstack().sort_index()
man = age_sex["Man"].to_frame()
woman = -age_sex["Woman"].to_frame()
ax = sns.barplot(data=man, x='Man', y=man.index, color="#006699", label='Male')
ax = sns.barplot(data=woman, x='Woman', y=woman.index, color="#ff3333", label='Female')
ax.set_xlim(-200, 600)
ax.set_xlabel('Number of Particepants')
ax.set_ylabel('Age Group', fontsize=15)
ax.set_title('Number of Male and Female Vs Age Group', fontsize=16, fontweight='bold')
for side in ('top', 'right', 'bottom'):
    ax.spines[side].set_visible(False)
# Value labels are currently disabled:
# for p in ax.patches:
#     width = p.get_width()
#     plt.text(5+p.get_width(), p.get_y()+0.55*p.get_height(),
#              '{:1.0f}'.format(abs(width)),
#              ha='center', va='center',rotation=90)
ax.legend();
# Q5 answers among respondents without a degree, most frequent first.
fig, ax = plt.subplots(1, 1, figsize=(15, 10))
_q5_counts = Kaggle_NDegree.Q5.value_counts()
XP = _q5_counts.sort_values(ascending=False).to_frame()
ax = sns.barplot(data=XP, x=XP.index, y='Q5', palette="viridis")
ax.set_title('Different type of Jobs in Survey', fontsize=16, fontweight='bold')
ax.set_xlabel('Current role')
ax.set_ylabel('Number of Particpents')
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
# Print each bar's count just above the bar.
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_middle = bar.get_x() + bar.get_width() / 2.
    ax.annotate(
        f"{bar_top:1.0f}",
        (bar_middle, bar_top),
        ha='center',
        va='center',
        xytext=(0, 9),
        textcoords='offset points',
    )
for side in ('top', 'left', 'right', 'bottom'):
    ax.spines[side].set_visible(False)
# Mirrored bar chart of Q5 shares: respondents without a degree point up,
# respondents with a degree (negated) point down.
fig, ax = plt.subplots(1, 1, figsize=(15, 6))
XP = Kaggle_NDegree.Q5.value_counts(normalize=True).to_frame()
XP1 = Kaggle_WDegree.Q5.value_counts(normalize=True).to_frame()
# Each barplot call places its bars by position, so both frames must list the
# same Q5 answers in the same order; an answer missing from one group would
# otherwise shift that group's bars under the wrong tick labels.
_roles = XP.index.union(XP1.index).sort_values(ascending=False)
XP = XP.reindex(_roles, fill_value=0)
XP1 = XP1.reindex(_roles, fill_value=0)
ax = sns.barplot(data=XP, y='Q5', x=XP.index, color="#006699", label='Without Degree')
ax = sns.barplot(data=-XP1, y='Q5', x=XP1.index, color="#ff3333", label='With Degree')
ax.set_ylabel('')
ax.set_xlabel('')
ax.set_title('Participants With Degree and Without Degree', fontsize=16, fontweight='bold')
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
for side in ('top', 'left', 'right', 'bottom'):
    ax.spines[side].set_visible(False)
# Label every bar with its share; abs() undoes the negation used for mirroring.
for bar in ax.patches:
    bar_top = bar.get_height()
    ax.annotate(
        '{:.1f}%'.format(abs(100 * bar_top)),
        (bar.get_x() + bar.get_width() / 2., bar_top),
        ha='center',
        va='center',
        xytext=(0, 6),
        textcoords='offset points',
    )
ax.set_yticks([])
ax.legend(loc='lower right');
fig, ax = plt.subplots(1, 2, figsize=(10, 5))


def _roles_by_experience(frame):
    """Count Q5 answers separately for each Q6 answer that occurs in `frame`.

    Returns a list of one-column DataFrames; each column is named after the
    Q6 answer it was counted for.
    """
    per_level = []
    for level in frame.Q6.value_counts().index.to_list():
        counts = frame.Q5.loc[frame.Q6 == str(level)].value_counts()
        per_level.append(counts.to_frame().rename(columns={'Q5': str(level)}))
    return per_level


K_heat = _roles_by_experience(Kaggle_NDegree)
res_K_heat = pd.concat(K_heat, axis=1)
K_heat_W = _roles_by_experience(Kaggle_WDegree)
res_K_heat_W = pd.concat(K_heat_W, axis=1)

# The right-hand heatmap hides its y tick labels and is read against the
# left-hand ones, so both tables must have identical rows and columns in the
# same order.  Combinations that never occur stay empty (NaN).
_rows = res_K_heat.index.union(res_K_heat_W.index).sort_values()
_cols = res_K_heat.columns.union(res_K_heat_W.columns).sort_values()
res_K_heat = res_K_heat.reindex(index=_rows, columns=_cols)
res_K_heat_W = res_K_heat_W.reindex(index=_rows, columns=_cols)

ax0 = sns.heatmap(res_K_heat, linewidths=1.2, cbar=False, annot=True, fmt='g',
                  cmap=sns.cubehelix_palette(as_cmap=True), ax=ax[0])
ax1 = sns.heatmap(res_K_heat_W, linewidths=1.2, cbar=False, annot=True, fmt='g',
                  cmap=sns.cubehelix_palette(as_cmap=True), ax=ax[1])
ax0.set_title('Participants without degree', fontsize=16, fontweight='bold')
ax1.set_title('Participants with degree', fontsize=16, fontweight='bold')
ax1.set_yticks([]);
# Q6 answer counts among respondents without a degree.
fig, ax = plt.subplots(1, 1, figsize=(15, 5))
# Explicit least-to-most order for the Q6 answers.  Sorting the answers as
# strings does not give this order, so the bars are placed by this list and
# the tick labels are taken from the plotted data rather than hard-coded.
_experience_order = [
    "I have never written code",
    "< 1 years",
    "1-2 years",
    "3-5 years",
    "5-10 years",
    "10-20 years",
    "20+ years",
]
XP = Kaggle_NDegree.Q6.value_counts().reindex(_experience_order, fill_value=0).to_frame()
ax = sns.barplot(data=XP, x=XP.index, y='Q6', palette="mako")
ax.set_title('Coding Experince', fontsize=16, fontweight='bold')
ax.set_xlabel('')
ax.set_ylabel('Number of Particpents', fontsize=21)
# Print each bar's count just above the bar.
for bar in ax.patches:
    bar_top = bar.get_height()
    ax.annotate(
        format(bar_top, '1.0f'),
        (bar.get_x() + bar.get_width() / 2., bar_top),
        ha='center',
        va='center',
        xytext=(0, 9),
        textcoords='offset points',
    )
for side in ('top', 'left', 'right', 'bottom'):
    ax.spines[side].set_visible(False)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90);
# Q5 answers of no-degree respondents whose Q6 answer is "20+ years".
_is_veteran = Kaggle_NDegree.Q6 == "20+ years"
Y20 = Kaggle_NDegree[_is_veteran]
fig, ax = plt.subplots(1, 1, figsize=(15, 6))
_veteran_roles = Y20.Q5.value_counts()
XP = _veteran_roles.sort_values(ascending=False).to_frame()
ax = sns.barplot(data=XP, x=XP.index, y='Q5', palette="mako")
ax.set_title('Current job with coding experience greater then 20 years', fontsize=16, fontweight='bold')
ax.set_xlabel('Current role')
ax.set_ylabel('Number of Particpents')
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
# Print each bar's count just above the bar.
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_middle = bar.get_x() + bar.get_width() / 2.
    ax.annotate(
        f"{bar_top:1.0f}",
        (bar_middle, bar_top),
        ha='center',
        va='center',
        xytext=(0, 9),
        textcoords='offset points',
    )
for side in ('top', 'left', 'right', 'bottom'):
    ax.spines[side].set_visible(False)
# No-degree respondents after removing the four lowest Q6 experience answers.
# The answers must be spelled exactly as they appear in the data
# ("< 1 years", with spaces); a misspelled entry silently keeps those rows.
# NOTE(review): this is an exclusion filter, so rows with no Q6 answer are
# kept as well - confirm that is wanted for a ">5 years" chart.
_low_experience = ["I have never written code", "< 1 years", "1-2 years", "3-5 years"]
MostXP = Kaggle_NDegree[~Kaggle_NDegree.Q6.isin(_low_experience)]
fig, ax = plt.subplots(1, 1, figsize=(15, 6))
XP = MostXP.Q5.value_counts().sort_values(ascending=False).to_frame()
ax = sns.barplot(data=XP, x=XP.index, y='Q5', palette="viridis")
ax.set_title('Current job with coding experience greater then 5 years', fontsize=21, fontweight='bold')
ax.set_xlabel('Current role')
ax.set_ylabel('Number of Particpents')
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
# Print each bar's count just above the bar.
for bar in ax.patches:
    bar_top = bar.get_height()
    ax.annotate(
        format(bar_top, '1.0f'),
        (bar.get_x() + bar.get_width() / 2., bar_top),
        ha='center',
        va='center',
        xytext=(0, 9),
        textcoords='offset points',
    )
for side in ('top', 'left', 'right', 'bottom'):
    ax.spines[side].set_visible(False)
# Mirrored Q5 shares for the more experienced respondents: without a degree
# (MostXP1, up) versus with a degree (Kaggle1, negated, down).
fig, ax = plt.subplots(1, 1, figsize=(15, 15))
_not_working = ["Student", "Other", "Currently not employed"]
_low_experience = ["1-2 years", "< 1 years", "3-5 years", "I have never written code"]
MostXP1 = MostXP[~MostXP.Q5.isin(_not_working)]
Kaggle1 = Kaggle_WDegree[~Kaggle_WDegree.Q5.isin(_not_working)]
Kaggle1 = Kaggle1[~Kaggle1.Q6.isin(_low_experience)]
XP = MostXP1.Q5.value_counts(normalize=True).to_frame()
XP1 = Kaggle1.Q5.value_counts(normalize=True).to_frame()
# Each barplot call places its bars by position, so both frames must list the
# same Q5 answers in the same order; an answer missing from one group would
# otherwise shift that group's bars under the wrong tick labels.
_roles = XP.index.union(XP1.index).sort_values(ascending=False)
XP = XP.reindex(_roles, fill_value=0)
XP1 = XP1.reindex(_roles, fill_value=0)
ax = sns.barplot(data=XP, y='Q5', x=XP.index, color="#006699", label='Without Degree')
ax = sns.barplot(data=-XP1, y='Q5', x=XP1.index, color="#ff3333", label='With Degree')
ax.set_ylabel('')
ax.set_xlabel('')
ax.set_title('Participants more experience in Coding', fontsize=16, fontweight='bold')
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
for side in ('top', 'left', 'right', 'bottom'):
    ax.spines[side].set_visible(False)
# Label every bar with its share; abs() undoes the negation used for mirroring.
for bar in ax.patches:
    bar_top = bar.get_height()
    ax.annotate(
        '{:.1f}%'.format(abs(100 * bar_top)),
        (bar.get_x() + bar.get_width() / 2., bar_top),
        ha='center',
        va='center',
        xytext=(0, 6),
        textcoords='offset points',
    )
ax.set_yticks([])
ax.legend(loc='lower right');
# Mirrored Q5 shares for the less experienced respondents: without a degree
# (Kaggle_NXP, up) versus with a degree (Kaggle_WXP, negated, down).
fig, ax = plt.subplots(1, 1, figsize=(15, 15))
_not_working = ["Student", "Other", "Currently not employed"]
_low_experience = ["1-2 years", "< 1 years", "3-5 years", "I have never written code"]
Kaggle_NXP = Kaggle_NDegree[~Kaggle_NDegree.Q5.isin(_not_working)]
Kaggle_NXP = Kaggle_NXP[Kaggle_NXP.Q6.isin(_low_experience)]
Kaggle_WXP = Kaggle_WDegree[~Kaggle_WDegree.Q5.isin(_not_working)]
Kaggle_WXP = Kaggle_WXP[Kaggle_WXP.Q6.isin(_low_experience)]
XP = Kaggle_NXP.Q5.value_counts(normalize=True).to_frame()
XP1 = Kaggle_WXP.Q5.value_counts(normalize=True).to_frame()
# Each barplot call places its bars by position, so both frames must list the
# same Q5 answers in the same order; an answer missing from one group would
# otherwise shift that group's bars under the wrong tick labels.
_roles = XP.index.union(XP1.index).sort_values(ascending=False)
XP = XP.reindex(_roles, fill_value=0)
XP1 = XP1.reindex(_roles, fill_value=0)
ax = sns.barplot(data=XP, y='Q5', x=XP.index, color="#006699", label='Without Degree')
ax = sns.barplot(data=-XP1, y='Q5', x=XP1.index, color="#ff3333", label='With Degree')
ax.set_ylabel('')
ax.set_xlabel('')
ax.set_title('Participants With Degree and Without Degree with less Experence in coding', fontsize=16, fontweight='bold')
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
for side in ('top', 'left', 'right', 'bottom'):
    ax.spines[side].set_visible(False)
# Label every bar with its share; abs() undoes the negation used for mirroring.
for bar in ax.patches:
    bar_top = bar.get_height()
    ax.annotate(
        '{:.1f}%'.format(abs(100 * bar_top)),
        (bar.get_x() + bar.get_width() / 2., bar_top),
        ha='center',
        va='center',
        xytext=(0, 6),
        textcoords='offset points',
    )
ax.set_yticks([])
ax.legend(loc='lower right');
# Mirrored Q5 shares among respondents WITHOUT a degree only: less coding
# experience (Kaggle_NXP, up) versus more coding experience (Kaggle_NoXP,
# negated, down).
fig, ax = plt.subplots(1, 1, figsize=(15, 15))
_not_working = ["Student", "Other", "Currently not employed"]
_low_experience = ["1-2 years", "< 1 years", "3-5 years", "I have never written code"]
_employed_no_degree = Kaggle_NDegree[~Kaggle_NDegree.Q5.isin(_not_working)]
Kaggle_NXP = _employed_no_degree[_employed_no_degree.Q6.isin(_low_experience)]
Kaggle_NoXP = _employed_no_degree[~_employed_no_degree.Q6.isin(_low_experience)]
XP = Kaggle_NXP.Q5.value_counts(normalize=True).to_frame()
# The 'More Coding XP' series must come from Kaggle_NoXP, the no-degree
# subset built just above; Kaggle1 holds respondents WITH a degree and does
# not belong in a chart titled 'without Degree'.
XP1 = Kaggle_NoXP.Q5.value_counts(normalize=True).to_frame()
# Each barplot call places its bars by position, so both frames must list the
# same Q5 answers in the same order; an answer missing from one group would
# otherwise shift that group's bars under the wrong tick labels.
_roles = XP.index.union(XP1.index).sort_values(ascending=False)
XP = XP.reindex(_roles, fill_value=0)
XP1 = XP1.reindex(_roles, fill_value=0)
ax = sns.barplot(data=XP, y='Q5', x=XP.index, color="#006699", label='Less Coding XP')
ax = sns.barplot(data=-XP1, y='Q5', x=XP1.index, color="#ff3333", label='More Coding XP')
ax.set_ylabel('')
ax.set_xlabel('')
ax.set_title('Participants without Degree with current job', fontsize=16, fontweight='bold')
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
for side in ('top', 'left', 'right', 'bottom'):
    ax.spines[side].set_visible(False)
# Label every bar with its share; abs() undoes the negation used for mirroring.
for bar in ax.patches:
    bar_top = bar.get_height()
    ax.annotate(
        '{:.1f}%'.format(abs(100 * bar_top)),
        (bar.get_x() + bar.get_width() / 2., bar_top),
        ha='center',
        va='center',
        xytext=(0, 6),
        textcoords='offset points',
    )
ax.set_yticks([])
ax.legend(loc='lower right');
# Share (in percent, two decimals) of "Currently not employed" Q5 answers in
# each group.  Note that `employed` holds the figure for respondents WITH a
# degree, despite its name.
_q5_share_no_degree = Kaggle_NDegree.Q5.value_counts(normalize=True)
_q5_share_degree = Kaggle_WDegree.Q5.value_counts(normalize=True)
Unemployed = round(_q5_share_no_degree.loc["Currently not employed"] * 100, 2)
employed = round(_q5_share_degree.loc["Currently not employed"] * 100, 2)
[{"Unemployed without Degree": Unemployed, "Unemployed with Degree": employed}]
def _not_employed_share(frame, job_col, label, year):
    """Return one survey year's not-employed percentages as a Series.

    Parameters:
        frame: survey DataFrame whose Q4 column already holds the labels
            'With Degree' / 'Without Degree'.
        job_col: column holding the job answer.
        label: the answer in `job_col` that means "not employed".
        year: value stored under 'Year'.

    Returns:
        Series with entries 'Year', 'With Degree' and 'Without Degree'; the
        two rates are percentages of each Q4 group, rounded to two decimals.

    Raises:
        KeyError: if `label` does not occur for one of the two Q4 groups.
    """
    shares = frame.groupby("Q4")[job_col].value_counts(normalize=True)
    row = {"Year": year}
    for group in ("With Degree", "Without Degree"):
        row[group] = round(shares.loc[(group, label)] * 100, 2)
    return pd.Series(row)


# Collapse Q4 of the 2019 and 2018 surveys into the same two labels as the
# 2020 data, using a plain column assignment rather than chained indexing.
# 'With Degree' is part of the match list so re-running this cell is harmless.
_degree_answers = [
    "Doctoral degree",
    "Master’s degree",
    "Bachelor’s degree",
    "Professional degree",
    "With Degree",
]
for _survey in (Kaggle19, Kaggle18):
    _survey["Q4"] = np.where(_survey["Q4"].isin(_degree_answers), 'With Degree', 'Without Degree')

## 2020 DataSet
K20 = _not_employed_share(D, "Q5", "Currently not employed", "2020")
## 2019 DataSet
K19 = _not_employed_share(Kaggle19, "Q5", "Not employed", "2019")
## 2018 DataSet (the job answer is read from Q6 for this survey)
K18 = _not_employed_share(Kaggle18, "Q6", "Not employed", "2018")

# One row per year, then long format: Year / Education / Percent.
HistData = pd.DataFrame([K20, K19, K18]).reset_index(drop=True)
HistData.rename(columns={"With Degree": "With degree"}, inplace=True)
HistData = HistData.melt(id_vars=['Year'], var_name='Education', value_name='Percent')
# The rows were built from mixed string/float Series; make Percent numeric.
HistData['Percent'] = HistData['Percent'].astype(float)
# Grouped bars of HistData: one group per Year, one bar per Education value.
fig, ax = plt.subplots(1, 1, figsize=(15, 5))
ax1 = sns.barplot(
    data=HistData,
    x='Year',
    y='Percent',
    hue='Education',
    palette="viridis",
)
ax1.set_title('Unemployement rate', fontsize=16, fontweight='bold')
ax1.set(xlabel="Year", ylabel="Unempoyment Rate")
# Show the y ticks as whole-number percentages.
ax.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.0f%%'));
# Q24 answer counts among respondents without a degree, most frequent first.
fig, ax = plt.subplots(1, 1, figsize=(15, 5))
_q24_counts = Kaggle_NDegree.Q24.value_counts()
XP = _q24_counts.sort_values(ascending=False).to_frame()
ax = sns.barplot(data=XP, x=XP.index, y='Q24', palette="viridis")
ax.set_title('Pay Gap of Participants without Degree', fontsize=16, fontweight='bold')
ax.set_xlabel('Pay Range')
ax.set_ylabel('Number of Particpents')
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
# Print each bar's count just above the bar.
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_middle = bar.get_x() + bar.get_width() / 2.
    ax.annotate(
        f"{bar_top:1.0f}",
        (bar_middle, bar_top),
        ha='center',
        va='center',
        xytext=(0, 9),
        textcoords='offset points',
    )
for side in ('top', 'left', 'right', 'bottom'):
    ax.spines[side].set_visible(False)
def _pay_floor(label):
    """Return the lower bound of a Q24 pay-range label as an int.

    Only the digits before the first '-' are used, e.g. '1,000-1,999' -> 1000
    and '$0-999' -> 0.  A label without digits sorts first (-1).
    """
    digits = "".join(ch for ch in str(label).split("-")[0] if ch.isdigit())
    return int(digits) if digits else -1


# Mirrored bar chart of Q24 shares: respondents without a degree point up,
# respondents with a degree (negated) point down.
fig, ax = plt.subplots(1, 1, figsize=(15, 10))
XP = Kaggle_NDegree.Q24.value_counts(normalize=True).to_frame()
XP1 = Kaggle_WDegree.Q24.value_counts(normalize=True).to_frame()
# Each barplot call places its bars by position, so both frames must list the
# same pay ranges in the same order.  The ranges are ordered by their lower
# bound; sorting the labels as strings would put '10,000-...' before '2,000-...'.
_pay_ranges = sorted(XP.index.union(XP1.index), key=_pay_floor)
XP = XP.reindex(_pay_ranges, fill_value=0)
XP1 = XP1.reindex(_pay_ranges, fill_value=0)
ax = sns.barplot(data=XP, y='Q24', x=XP.index, color="#006699", label='Without Degree')
ax = sns.barplot(data=-XP1, y='Q24', x=XP1.index, color="#ff3333", label='With Degree')
ax.set_ylabel('')
ax.set_xlabel('')
ax.set_title('Participants Pay Gap with and without degree', fontsize=16, fontweight='bold')
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
for side in ('top', 'left', 'right', 'bottom'):
    ax.spines[side].set_visible(False)
# Label every bar with its share; abs() undoes the negation used for mirroring.
for bar in ax.patches:
    bar_top = bar.get_height()
    ax.annotate(
        '{:.1f}%'.format(abs(100 * bar_top)),
        (bar.get_x() + bar.get_width() / 2., bar_top),
        ha='center',
        va='center',
        xytext=(0, 6),
        textcoords='offset points',
    )
ax.set_yticks([])
ax.legend(loc='lower right');
import pycountry
# Number of no-degree respondents per Q3 answer, as a one-column frame.
_q3_counts = Kaggle_NDegree["Q3"].value_counts()
Map = _q3_counts.to_frame()
def alpha3code(column):
    """Translate country names into three-letter (alpha-3) country codes.

    Parameters:
        column: iterable of country names.

    Returns:
        A list with one entry per name, in input order:
          * 'Other' is passed through unchanged,
          * a name pycountry can resolve becomes the ``alpha_3`` code of the
            first fuzzy-search hit,
          * a name pycountry cannot resolve becomes the string 'None'.
    """
    CODE = []
    for country in column:
        if country == 'Other':
            CODE.append('Other')
            continue
        try:
            # .alpha_3 is the 3-letter country code (.alpha_2 the 2-letter one)
            CODE.append(pycountry.countries.search_fuzzy(country)[0].alpha_3)
        except LookupError:
            # Only a failed lookup maps to 'None'; any other error (for
            # example a missing import) is allowed to surface.
            CODE.append('None')
    return CODE
# create a column for code
Map['CODE'] = alpha3code(Map.index)
Map.head()
import geopandas
from geopandas import GeoDataFrame
world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))
# rename the columns so that we can merge with our data
world.columns = ['pop_est', 'continent', 'name', 'CODE', 'gdp_md_est', 'geometry']
# then merge with our data; how='right' keeps every country shape, so
# countries without respondents end up with a missing Q3 count
merge = pd.merge(Map, world, how='right', on='CODE')
# merge['Q3'] = merge['Q3'].fillna(0)
merge = GeoDataFrame(merge).sort_values(by='Q3', ascending=False)
# attach a latitude/longitude per country name for placing the text labels
location = pd.read_csv('https://raw.githubusercontent.com/melanieshi0120/COVID-19_global_time_series_panel_data/master/data/countries_latitude_longitude.csv')
merge = merge.merge(location, on='name').reset_index()
merge.head()
# Override Egypt's label position.  The row is selected by country name, so
# no other country that happens to share a coordinate value is modified, and
# nothing happens if Egypt is absent.
_is_egypt = merge['name'] == "Egypt"
merge.loc[_is_egypt, 'latitude'] = 26.8357675
merge.loc[_is_egypt, 'longitude'] = 30.7956597
merge.plot(column='Q3', scheme="quantiles",
           figsize=(25, 20), cmap='viridis',
           legend=True, missing_kwds={'color': 'lightgrey',
                                      "hatch": "///",
                                      "label": "Missing values"})
plt.title('2020 Participantas with No Formal Degree', fontsize=25)
# add countries names and numbers for (up to) the 20 largest counts; rows
# without a count are skipped because int() cannot convert a missing value
_top = merge.dropna(subset=['Q3']).head(20)
for _lon, _lat, _country, _count in zip(_top['longitude'], _top['latitude'], _top['name'], _top['Q3']):
    plt.text(float(_lon), float(_lat), "{}\n{}".format(_country, int(_count)), size=10)
# Number of with-degree respondents per Q3 answer, as a one-column frame.
_q3_counts = Kaggle_WDegree["Q3"].value_counts()
Map = _q3_counts.to_frame()
def alpha3code(column):
    """Translate country names into three-letter (alpha-3) country codes.

    Parameters:
        column: iterable of country names.

    Returns:
        A list with one entry per name, in input order:
          * 'Other' is passed through unchanged,
          * a name pycountry can resolve becomes the ``alpha_3`` code of the
            first fuzzy-search hit,
          * a name pycountry cannot resolve becomes the string 'None'.
    """
    CODE = []
    for country in column:
        if country == 'Other':
            CODE.append('Other')
            continue
        try:
            # .alpha_3 is the 3-letter country code (.alpha_2 the 2-letter one)
            CODE.append(pycountry.countries.search_fuzzy(country)[0].alpha_3)
        except LookupError:
            # Only a failed lookup maps to 'None'; any other error (for
            # example a missing import) is allowed to surface.
            CODE.append('None')
    return CODE
# create a column for code
Map['CODE'] = alpha3code(Map.index)
Map.head()
world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))
# rename the columns so that we can merge with our data
world.columns = ['pop_est', 'continent', 'name', 'CODE', 'gdp_md_est', 'geometry']
# then merge with our data; how='right' keeps every country shape, so
# countries without respondents end up with a missing Q3 count
merge = pd.merge(Map, world, how='right', on='CODE')
# merge['Q3'] = merge['Q3'].fillna(0)
merge = GeoDataFrame(merge).sort_values(by='Q3', ascending=False)
# attach a latitude/longitude per country name for placing the text labels
location = pd.read_csv('https://raw.githubusercontent.com/melanieshi0120/COVID-19_global_time_series_panel_data/master/data/countries_latitude_longitude.csv')
merge = merge.merge(location, on='name').reset_index()
# Same Egypt label position as used for the no-degree map, so both maps place
# the label identically.  Selected by country name; a no-op if Egypt is absent.
_is_egypt = merge['name'] == "Egypt"
merge.loc[_is_egypt, 'latitude'] = 26.8357675
merge.loc[_is_egypt, 'longitude'] = 30.7956597
merge.plot(column='Q3', scheme="quantiles",
           figsize=(25, 20), cmap='viridis',
           legend=True, missing_kwds={'color': 'lightgrey',
                                      "hatch": "///",
                                      "label": "Missing values"})
plt.title('2020 Participantas with Degree', fontsize=25)
# add countries names and numbers for (up to) the 20 largest counts; rows
# without a count are skipped because int() cannot convert a missing value
_top = merge.dropna(subset=['Q3']).head(20)
for _lon, _lat, _country, _count in zip(_top['longitude'], _top['latitude'], _top['name'], _top['Q3']):
    plt.text(float(_lon), float(_lat), "{}\n{}".format(_country, int(_count)), size=10)
# Per Q3 answer: share of each Q24 answer among no-degree respondents, in
# percent with two decimals.
_q24_share = Kaggle_NDegree.groupby("Q3")["Q24"].value_counts(normalize=True)
Top10 = round(_q24_share * 100, 2)
Top10 = Top10.to_frame().rename(columns={"Q24": "Percentage"})
Top10 = Top10.reset_index().sort_values(by="Q3", ascending=False)
# Keep the five countries that are compared in the chart.
_five_countries = ["India", "United States of America", "Japan", "Russia", "Brazil"]
Top5 = Top10[Top10.Q3.isin(_five_countries)]
fig, ax = plt.subplots(1, 1, figsize=(15, 10))
ax1 = sns.barplot(
    data=Top5,
    x='Q24',
    y='Percentage',
    hue='Q3',
    palette="viridis",
)
ax1.set_title('Pay Gap vs Top Five COuntries without Degree Normalized', fontsize=16, fontweight='bold')
ax1.set(ylabel="Percentage")
ax1.set_xlabel('Pay Gap')
plt.xticks(rotation=90)
ax1.legend(title="");
# Show the y ticks as whole-number percentages.
ax.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.0f%%'));
# Narrow the previous selection to five Q24 answers and redraw the chart.
_five_ranges = ["$0-999", "1,000-1,999", "10,000-14,999", "15,000-19,999", "25,000-29,999"]
_in_ranges = Top5.Q24.isin(_five_ranges)
Top5 = Top5[_in_ranges].sort_values(by="Q24")
fig, ax = plt.subplots(1, 1, figsize=(15, 10))
ax1 = sns.barplot(
    data=Top5,
    x='Q24',
    y='Percentage',
    hue='Q3',
    palette="viridis",
)
ax1.set_title('Pay Gap vs Top Five Countries without Degree Normalized', fontsize=16, fontweight='bold')
ax1.set(ylabel="Percentage")
ax1.set_xlabel('Pay Gap')
ax1.legend(title="");
# Show the y ticks as whole-number percentages.
ax.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.0f%%'));
# Same five-country, five-range comparison for respondents WITH a degree.
_q24_share = Kaggle_WDegree.groupby("Q3")["Q24"].value_counts(normalize=True)
Top10 = round(_q24_share * 100, 2)
Top10 = Top10.to_frame().rename(columns={"Q24": "Percentage"})
Top10 = Top10.reset_index().sort_values(by="Q3", ascending=False)
_five_countries = ["India", "United States of America", "Japan", "Russia", "Brazil"]
_five_ranges = ["$0-999", "1,000-1,999", "10,000-14,999", "15,000-19,999", "25,000-29,999"]
Top5 = Top10[Top10.Q3.isin(_five_countries)]
Top5 = Top5[Top5.Q24.isin(_five_ranges)].sort_values(by="Q24")
fig, ax = plt.subplots(1, 1, figsize=(15, 10))
ax1 = sns.barplot(
    data=Top5,
    x='Q24',
    y='Percentage',
    hue='Q3',
    palette="viridis",
)
ax1.set_title('Pay Gap vs Top Five Countries With Degree Normalized', fontsize=16, fontweight='bold')
ax1.set(ylabel="Percentage")
ax1.set_xlabel('Pay Gap')
ax1.set_ylim([0, 50])
ax1.legend(title="");
# Show the y ticks as whole-number percentages.
ax.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.0f%%'));
# Rows of D whose Q3 answer is "India".
_from_india = D.Q3 == "India"
India = D[_from_india]
# Per Q4 label: share of each Q24 answer, in percent with two decimals.
_q24_share = India.groupby("Q4")["Q24"].value_counts(normalize=True)
India_Edu = round(_q24_share * 100, 2)
India_Edu = India_Edu.to_frame().rename(columns={"Q24": "Percentage"})
India_Edu = India_Edu.reset_index().sort_values(by="Q4", ascending=False)
_five_ranges = ["$0-999", "1,000-1,999", "10,000-14,999", "15,000-19,999", "25,000-29,999"]
India_Edu = India_Edu[India_Edu.Q24.isin(_five_ranges)].sort_values(by="Q24")
fig, ax = plt.subplots(1, 1, figsize=(15, 8))
ax1 = sns.barplot(
    data=India_Edu,
    x='Q24',
    y='Percentage',
    hue='Q4',
    palette="viridis",
)
ax1.set_title('India Pay Gap/Without Degree Normalized', fontsize=16)
ax1.set(ylabel="Percentage")
ax1.set_xlabel('Pay Gap')
ax1.set_ylim([0, 50])
ax1.legend(title="");
# Show the y ticks as whole-number percentages.
ax.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.0f%%'));
# Per (Q4, Q24) pair: share of each Q5 answer, in percent with two decimals.
_q5_share = India.groupby(["Q4", "Q24"])["Q5"].value_counts(normalize=True)
India_Job = round(_q5_share * 100, 2).to_frame()
India_Job = India_Job.rename(columns={"Q5": "Percentage"}).reset_index()
India_Job = India_Job.sort_values(by="Q24", ascending=True)
_five_ranges = ["$0-999", "1,000-1,999", "10,000-14,999", "15,000-19,999", "25,000-29,999"]
India_Job = India_Job[India_Job.Q24.isin(_five_ranges)].sort_values(by="Q24")
# One bar panel per Q4 label; note that `ax` is the grid object returned by
# catplot here, not a single Axes.
ax = sns.catplot(
    x="Q24",
    y="Percentage",
    hue="Q5",
    col="Q4",
    data=India_Job,
    kind="bar",
)
# ax.set_title('India Pay Gap and Jobs With/Without Degree Normalized',fontsize=21, fontweight='bold')
ax.set(xlabel='Pay Gap')
ax.set_xticklabels(rotation=90);
# Scatter view of the same table: colour by Q5 answer, marker size by Q4 label.
axes = sns.relplot(
    x="Q24",
    y="Percentage",
    hue="Q5",
    size="Q4",
    data=India_Job,
    palette='viridis',
    sizes=(100, 300),
    alpha=0.5,
    aspect=2,
)
axes.set(
    xlabel='Pay Gap',
    title="India Pay Gap and Jobs With/Without Degree Normalized",
    ylim=(0, 45),
)
axes.set_xticklabels(rotation=90);
# Keep only the "$0-999" rows and draw them as one bar panel per Q4 label.
_lowest_range = India_Job.Q24.isin(["$0-999"])
India_Job = India_Job[_lowest_range]
ax23 = sns.catplot(
    x="Q24",
    y="Percentage",
    hue="Q5",
    col="Q4",
    data=India_Job,
    kind="bar",
)