Kaggle Competition 2020 DS

!pip install seaborn==0.11.0

import numpy as np import pandas as pd pd.set_option('display.max_columns', None) import seaborn as sns import matplotlib.pyplot as plt import warnings warnings.simplefilter(action='ignore')

# Load the 2020 survey responses and index them by completion time.
Kaggle = pd.read_csv("../input/kaggle-survey-2020/kaggle_survey_2020_responses.csv")
# Drop the row labelled 0 before converting the duration column to integers
# (presumably the question-text row of the export -- verify against the CSV).
Kaggle = Kaggle.drop(index=[0])
# Pull the duration column out of the frame and keep it as whole seconds.
first_col = Kaggle.pop("Time from Start to Finish (seconds)").astype(int)
# Only the time-of-day part is kept, so durations of 24 hours or more wrap.
first_col = pd.to_datetime(first_col, unit='s').dt.time.rename('time')
Kaggle.insert(0, 'time', first_col)
Kaggle = Kaggle.set_index('time')
Kaggle["Year"] = "2020"
Kaggle.head(3)

# Load the 2019 survey responses and index them by completion time.
Kaggle19 = pd.read_csv("../input/kaggle-survey-2019/multiple_choice_responses.csv")
# Drop the row labelled 0 before converting the duration column to integers
# (presumably the question-text row of the export -- verify against the CSV).
Kaggle19 = Kaggle19.drop(index=[0])
# Pull the duration column out of the frame and keep it as whole seconds.
first_col = Kaggle19.pop("Time from Start to Finish (seconds)").astype(int)
# Only the time-of-day part is kept, so durations of 24 hours or more wrap.
first_col = pd.to_datetime(first_col, unit='s').dt.time.rename('time')
Kaggle19.insert(0, 'time', first_col)
Kaggle19 = Kaggle19.set_index('time')
Kaggle19["Year"] = "2019"
Kaggle19.head(3)

# Load the 2018 survey responses and index them by completion time.
Kaggle18 = pd.read_csv("../input/kaggle-survey-2018/multipleChoiceResponses.csv")
# Drop the row labelled 0 before converting the duration column to integers
# (presumably the question-text row of the export -- verify against the CSV).
Kaggle18 = Kaggle18.drop(index=[0])
# Pull the duration column out of the frame and keep it as whole seconds.
first_col = Kaggle18.pop("Time from Start to Finish (seconds)").astype(int)
# Only the time-of-day part is kept, so durations of 24 hours or more wrap.
first_col = pd.to_datetime(first_col, unit='s').dt.time.rename('time')
Kaggle18.insert(0, 'time', first_col)
Kaggle18 = Kaggle18.set_index('time')
Kaggle18["Year"] = "2018"
Kaggle18.head(3)

# Split the 2020 respondents on their Q4 answer: the four degree answers
# below go to Kaggle_WDegree, everything else (missing answers included)
# goes to Kaggle_NDegree.
_degree_answers = ["Doctoral degree", "Master’s degree", "Bachelor’s degree", "Professional degree"]
_holds_degree = Kaggle.Q4.isin(_degree_answers)
Kaggle_NDegree = Kaggle[~_holds_degree]
Kaggle_WDegree = Kaggle[_holds_degree]
Kaggle_WDegree.head(3)
Kaggle_NDegree.head(3)

# Collapse Q4 into two labels: 'With Degree' / 'Without Degree'.
# D is an alias of Kaggle (not a copy), so Kaggle.Q4 is rewritten as well.
D = Kaggle
# The whole column is assigned in one step rather than through
# `D.Q4[mask] = ...`: chained assignment is not guaranteed to write back
# into the frame and does nothing under pandas copy-on-write.
# 'With Degree' is accepted as an input value so that re-running the cell
# keeps the existing labels; every other value, missing answers included,
# becomes 'Without Degree'.
_with_degree = D["Q4"].isin(
    ["Doctoral degree", "Master’s degree", "Bachelor’s degree", "Professional degree", "With Degree"]
)
D["Q4"] = _with_degree.map({True: 'With Degree', False: 'Without Degree'})

# Histogram of age groups (Q1), split by the collapsed degree label (Q4).
fig, ax = plt.subplots(1, 1, figsize=(15, 5))
ax1 = sns.histplot(D.sort_values(by="Q1"), x="Q1", kde=True, hue='Q4', palette="viridis")
ax1.set_title('Age Group with and without Degree', fontsize=16, fontweight='bold')
# The age groups are on the x axis and the y axis holds the counts; the
# labels were previously swapped ('Age Group' was put on the count axis).
ax1.set(xlabel='Age Group', ylabel='Count');

# Count respondents per age group (rows) and degree label (columns), then
# add each age group's share of its degree label's total, in percent.
catD = D.groupby("Q1")["Q4"].value_counts().unstack()
for _label in ("With Degree", "Without Degree"):
    # Series.sum() skips the NaN that unstack() leaves for an age group with
    # no respondents in this column; the builtin sum() would return NaN and
    # blank out the whole percentage column.
    catD[_label + " %"] = ((catD[_label] / catD[_label].sum()) * 100).round(1)
catD.sort_index(inplace=True)
catD

import matplotlib.ticker as mtick

# Reshape the two percentage columns of catD to long form
# (one row per age group and degree label) for a grouped bar chart.
catNew = catD[["With Degree %", "Without Degree %"]].stack()
catNew = catNew.to_frame().reset_index().rename(columns={0: "count"})

fig, ax = plt.subplots(1, 1, figsize=(15, 5))
ax1 = sns.barplot(
    data=catNew.sort_values(by="Q1"),
    x='Q1',
    y='count',
    hue='Q4',
    palette="viridis",
)
ax1.set_title('Age Group with and without Degree Normalized', fontsize=16, fontweight='bold')
ax1.set_xlabel('Age Group')
ax1.set_ylabel("")
ax1.legend(title="")
# Show the y ticks as whole percentages.
ax.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.0f%%'));

# Bar chart of the Q2 answers among respondents without a degree.
f, axes = plt.subplots(1, 1, figsize=(10, 5))
sns.set_style("whitegrid", {'axes.grid': False})
sex = Kaggle_NDegree.Q2.value_counts().sort_values(ascending=False).to_frame()
ax1 = sns.barplot(data=sex, x=sex.index, y='Q2', palette="coolwarm")
ax1.set_title('Different type of Sex in Survey', fontsize=21, fontweight='bold')
ax1.set_xlabel('Sex')
ax1.set_ylabel('')
ax1.set_xticklabels(ax1.get_xticklabels(), rotation=90)
ax1.set_yticks([])
# Write each bar's count just above its top edge.
for bar in ax1.patches:
    bar_top = bar.get_height()
    bar_mid = bar.get_x() + bar.get_width() / 2.
    ax1.annotate(f"{bar_top:1.0f}", (bar_mid, bar_top),
                 ha='center', va='center',
                 xytext=(0, 9), textcoords='offset points')
# Remove the frame around the plot.
for side in ('top', 'left', 'right', 'bottom'):
    ax1.spines[side].set_visible(False)

# Population-pyramid style chart for respondents without a degree:
# "Man" counts extend to the right, negated "Woman" counts to the left.
fig, ax = plt.subplots(1, 1, figsize=(15, 10))
age_sex = Kaggle_NDegree.groupby(['Q1'])['Q2'].value_counts().unstack().sort_index()
man = age_sex["Man"].to_frame()
woman = -(age_sex["Woman"].to_frame())
ax = sns.barplot(data=man, x='Man', y=man.index, color="#006699", label='Male')
ax = sns.barplot(data=woman, x='Woman', y=woman.index, color="#ff3333", label='Female')
ax.set_xlim(-200, 600)
ax.set_xlabel('Number of Particepants')
ax.set_ylabel('Age Group', fontsize=15)
ax.set_title('Number of Male and Female Vs Age Group', fontsize=16, fontweight='bold')
for side in ('top', 'right', 'bottom'):
    ax.spines[side].set_visible(False)
# Bar annotation is currently disabled:
# for p in ax.patches:
#     width = p.get_width()
#     plt.text(5+p.get_width(), p.get_y()+0.55*p.get_height(),
#              '{:1.0f}'.format(abs(width)),
#              ha='center', va='center',rotation=90)
ax.legend();

# Bar chart of current roles (Q5) among respondents without a degree.
fig, ax = plt.subplots(1, 1, figsize=(15, 10))
XP = Kaggle_NDegree.Q5.value_counts().sort_values(ascending=False).to_frame()
ax = sns.barplot(data=XP, x=XP.index, y='Q5', palette="viridis")
ax.set_title('Different type of Jobs in Survey', fontsize=16, fontweight='bold')
ax.set_xlabel('Current role')
ax.set_ylabel('Number of Particpents')
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
# Write each bar's count just above its top edge.
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_mid = bar.get_x() + bar.get_width() / 2.
    ax.annotate(f"{bar_top:1.0f}", (bar_mid, bar_top),
                ha='center', va='center',
                xytext=(0, 9), textcoords='offset points')
# Remove the frame around the plot.
for side in ('top', 'left', 'right', 'bottom'):
    ax.spines[side].set_visible(False)

# Mirror bar chart of role shares (Q5): respondents without a degree are
# drawn upwards, respondents with a degree downwards (negated shares).
fig, ax = plt.subplots(1, 1, figsize=(15, 6))
XP = Kaggle_NDegree.Q5.value_counts(normalize=True).sort_index(ascending=False).to_frame()
XP1 = Kaggle_WDegree.Q5.value_counts(normalize=True).sort_index(ascending=False).to_frame()
ax = sns.barplot(data=XP, y='Q5', x=XP.index, color="#006699", label='Without Degree')
ax = sns.barplot(data=-XP1, y='Q5', x=XP1.index, color="#ff3333", label='With Degree')
ax.set_ylabel('')
ax.set_xlabel('')
ax.set_title('Participants With Degree and Without Degree', fontsize=16, fontweight='bold')
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
for side in ('top', 'left', 'right', 'bottom'):
    ax.spines[side].set_visible(False)
# Annotate every bar with the absolute share as a percentage.
for bar in ax.patches:
    share = bar.get_height()
    ax.annotate(f"{abs(100 * share):.1f}%",
                (bar.get_x() + bar.get_width() / 2., share),
                ha='center', va='center',
                xytext=(0, 6), textcoords='offset points')
ax.set_yticks([])
ax.legend(loc='lower right');

# Two heatmaps of role (Q5, rows) by coding experience (Q6, columns):
# left for respondents without a degree, right for those with one.
fig, ax = plt.subplots(1, 2, figsize=(10, 5))

# One single-column frame of role counts per experience level.
K_heat = [
    Kaggle_NDegree.Q5.loc[Kaggle_NDegree.Q6 == str(level)]
    .value_counts()
    .to_frame()
    .rename(columns={'Q5': str(level)})
    for level in Kaggle_NDegree.Q6.value_counts().index.to_list()
]
res_K_heat = pd.concat(K_heat, axis=1)

K_heat_W = [
    Kaggle_WDegree.Q5.loc[Kaggle_WDegree.Q6 == str(level)]
    .value_counts()
    .to_frame()
    .rename(columns={'Q5': str(level)})
    for level in Kaggle_WDegree.Q6.value_counts().index.to_list()
]
res_K_heat_W = pd.concat(K_heat_W, axis=1)

ax0 = sns.heatmap(
    res_K_heat.sort_index(axis=1),
    linewidths=1.2, cbar=False, annot=True, fmt='g',
    cmap=sns.cubehelix_palette(as_cmap=True), ax=ax[0],
)
ax1 = sns.heatmap(
    res_K_heat_W.sort_index(axis=1),
    linewidths=1.2, cbar=False, annot=True, fmt='g',
    cmap=sns.cubehelix_palette(as_cmap=True), ax=ax[1],
)
ax0.set_title('Participants without degree', fontsize=16, fontweight='bold')
ax1.set_title('Participants with degree', fontsize=16, fontweight='bold')
# The right panel has no y tick labels of its own.
ax1.set_yticks([]);

# Bar chart of coding experience (Q6) among respondents without a degree.
#
# The bars used to be ordered by a descending *string* sort of the answers
# while the tick labels were hard-coded in chronological order, so most
# bars carried the wrong label. The data is now put into chronological
# order explicitly, which makes the fixed labels line up with the bars.
# NOTE(review): the "5-10 years" and "10-20 years" answer strings are
# inferred from the tick labels -- confirm against the survey data.
_xp_levels = ["I have never written code", "< 1 years", "1-2 years", "3-5 years",
              "5-10 years", "10-20 years", "20+ years"]
_xp_labels = ["I have never written code","<1 years","1-2 years","3-5 years","5-10 years","10-20 years","20+ years"]

fig, ax = plt.subplots(1, 1, figsize=(15, 5))
# Levels nobody selected get a zero-height bar instead of being dropped.
XP = Kaggle_NDegree.Q6.value_counts().reindex(_xp_levels, fill_value=0).to_frame()
ax = sns.barplot(data=XP, x=XP.index, y='Q6', order=_xp_levels, palette="mako")
ax.set_title('Coding Experince', fontsize=16, fontweight='bold')
ax.set_xlabel('')
ax.set_ylabel('Number of Particpents', fontsize=21)
# Write each bar's count just above its top edge.
for bar in ax.patches:
    bar_top = bar.get_height()
    ax.annotate(f"{bar_top:1.0f}",
                (bar.get_x() + bar.get_width() / 2., bar_top),
                ha='center', va='center',
                xytext=(0, 9), textcoords='offset points')
for side in ('top', 'left', 'right', 'bottom'):
    ax.spines[side].set_visible(False)
ax.set_xticklabels(_xp_labels, rotation=90);

# Current roles (Q5) of respondents without a degree whose Q6 answer is
# "20+ years".
Y20 = Kaggle_NDegree[Kaggle_NDegree.Q6 == "20+ years"]
fig, ax = plt.subplots(1, 1, figsize=(15, 6))
XP = Y20.Q5.value_counts().sort_values(ascending=False).to_frame()
ax = sns.barplot(data=XP, x=XP.index, y='Q5', palette="mako")
ax.set_title('Current job with coding experience greater then 20 years', fontsize=16, fontweight='bold')
ax.set_xlabel('Current role')
ax.set_ylabel('Number of Particpents')
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
# Write each bar's count just above its top edge.
for bar in ax.patches:
    bar_top = bar.get_height()
    ax.annotate(f"{bar_top:1.0f}",
                (bar.get_x() + bar.get_width() / 2., bar_top),
                ha='center', va='center',
                xytext=(0, 9), textcoords='offset points')
for side in ('top', 'left', 'right', 'bottom'):
    ax.spines[side].set_visible(False)

# Respondents without a degree, minus the four lower coding-experience
# answers. Rows with a missing Q6 answer are kept, as before.
# The under-one-year answer is spelled "< 1 years" (the spelling used by the
# other Q6 filters in this notebook); the earlier "<1years" never matched,
# so those respondents were wrongly left in this group.
_low_xp_answers = ["1-2 years", "< 1 years", "3-5 years", "I have never written code"]
MostXP = Kaggle_NDegree[~Kaggle_NDegree.Q6.isin(_low_xp_answers)]

# Current roles (Q5) of the MostXP subset.
fig, ax = plt.subplots(1, 1, figsize=(15, 6))
XP = MostXP.Q5.value_counts().sort_values(ascending=False).to_frame()
ax = sns.barplot(data=XP, x=XP.index, y='Q5', palette="viridis")
ax.set_title('Current job with coding experience greater then 5 years', fontsize=21, fontweight='bold')
ax.set_xlabel('Current role')
ax.set_ylabel('Number of Particpents')
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
# Write each bar's count just above its top edge.
for bar in ax.patches:
    bar_top = bar.get_height()
    ax.annotate(f"{bar_top:1.0f}",
                (bar.get_x() + bar.get_width() / 2., bar_top),
                ha='center', va='center',
                xytext=(0, 9), textcoords='offset points')
for side in ('top', 'left', 'right', 'bottom'):
    ax.spines[side].set_visible(False)

# Mirror bar chart of role shares among more experienced coders:
# without a degree (MostXP1, upwards) vs with a degree (Kaggle1, downwards).
fig, ax = plt.subplots(1, 1, figsize=(15, 15))

_non_job_roles = ["Student", "Other", "Currently not employed"]
_low_xp_levels = ["1-2 years", "< 1 years", "3-5 years", "I have never written code"]

MostXP1 = MostXP[~MostXP.Q5.isin(_non_job_roles)]
Kaggle1 = Kaggle_WDegree[~Kaggle_WDegree.Q5.isin(_non_job_roles)]
Kaggle1 = Kaggle1[~Kaggle1.Q6.isin(_low_xp_levels)]

XP = MostXP1.Q5.value_counts(normalize=True).sort_index(ascending=False).to_frame()
XP1 = Kaggle1.Q5.value_counts(normalize=True).sort_index(ascending=False).to_frame()
ax = sns.barplot(data=XP, y='Q5', x=XP.index, color="#006699", label='Without Degree')
ax = sns.barplot(data=-XP1, y='Q5', x=XP1.index, color="#ff3333", label='With Degree')
ax.set_ylabel('')
ax.set_xlabel('')
ax.set_title('Participants more experience in Coding', fontsize=16, fontweight='bold')
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
for side in ('top', 'left', 'right', 'bottom'):
    ax.spines[side].set_visible(False)
# Annotate every bar with the absolute share as a percentage.
for bar in ax.patches:
    share = bar.get_height()
    ax.annotate(f"{abs(100 * share):.1f}%",
                (bar.get_x() + bar.get_width() / 2., share),
                ha='center', va='center',
                xytext=(0, 6), textcoords='offset points')
ax.set_yticks([])
ax.legend(loc='lower right');

# Mirror bar chart of role shares among less experienced coders:
# without a degree (Kaggle_NXP, upwards) vs with a degree (Kaggle_WXP, downwards).
fig, ax = plt.subplots(1, 1, figsize=(15, 15))

_non_job_roles = ["Student", "Other", "Currently not employed"]
_low_xp_levels = ["1-2 years", "< 1 years", "3-5 years", "I have never written code"]

Kaggle_NXP = Kaggle_NDegree[~Kaggle_NDegree.Q5.isin(_non_job_roles)]
Kaggle_NXP = Kaggle_NXP[Kaggle_NXP.Q6.isin(_low_xp_levels)]
Kaggle_WXP = Kaggle_WDegree[~Kaggle_WDegree.Q5.isin(_non_job_roles)]
Kaggle_WXP = Kaggle_WXP[Kaggle_WXP.Q6.isin(_low_xp_levels)]

XP = Kaggle_NXP.Q5.value_counts(normalize=True).sort_index(ascending=False).to_frame()
XP1 = Kaggle_WXP.Q5.value_counts(normalize=True).sort_index(ascending=False).to_frame()
ax = sns.barplot(data=XP, y='Q5', x=XP.index, color="#006699", label='Without Degree')
ax = sns.barplot(data=-XP1, y='Q5', x=XP1.index, color="#ff3333", label='With Degree')
ax.set_ylabel('')
ax.set_xlabel('')
ax.set_title('Participants With Degree and Without Degree with less Experence in coding', fontsize=16, fontweight='bold')
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
for side in ('top', 'left', 'right', 'bottom'):
    ax.spines[side].set_visible(False)
# Annotate every bar with the absolute share as a percentage.
for bar in ax.patches:
    share = bar.get_height()
    ax.annotate(f"{abs(100 * share):.1f}%",
                (bar.get_x() + bar.get_width() / 2., share),
                ha='center', va='center',
                xytext=(0, 6), textcoords='offset points')
ax.set_yticks([])
ax.legend(loc='lower right');

# Mirror bar chart of role shares among respondents WITHOUT a degree:
# less coding experience (Kaggle_NXP, upwards) vs more coding experience
# (Kaggle_NoXP, downwards).
fig, ax = plt.subplots(1, 1, figsize=(15, 15))

_non_job_roles = ["Student", "Other", "Currently not employed"]
_low_xp_levels = ["1-2 years", "< 1 years", "3-5 years", "I have never written code"]

# Both groups start from the no-degree respondents with an actual job role.
_employed_no_degree = Kaggle_NDegree[~Kaggle_NDegree.Q5.isin(_non_job_roles)]
Kaggle_NXP = _employed_no_degree[_employed_no_degree.Q6.isin(_low_xp_levels)]
Kaggle_NoXP = _employed_no_degree[~_employed_no_degree.Q6.isin(_low_xp_levels)]

XP = Kaggle_NXP.Q5.value_counts(normalize=True).sort_index(ascending=False).to_frame()
# The 'More Coding XP' series is taken from Kaggle_NoXP. It used to be read
# from Kaggle1, which holds respondents WITH a degree and therefore did not
# match this chart's title and legend.
XP1 = Kaggle_NoXP.Q5.value_counts(normalize=True).sort_index(ascending=False).to_frame()
ax = sns.barplot(data=XP, y='Q5', x=XP.index, color="#006699", label='Less Coding XP')
ax = sns.barplot(data=-XP1, y='Q5', x=XP1.index, color="#ff3333", label='More Coding XP')
ax.set_ylabel('')
ax.set_xlabel('')
ax.set_title('Participants without Degree with current job', fontsize=16, fontweight='bold')
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
for side in ('top', 'left', 'right', 'bottom'):
    ax.spines[side].set_visible(False)
# Annotate every bar with the absolute share as a percentage.
for bar in ax.patches:
    share = bar.get_height()
    ax.annotate(f"{abs(100 * share):.1f}%",
                (bar.get_x() + bar.get_width() / 2., share),
                ha='center', va='center',
                xytext=(0, 6), textcoords='offset points')
ax.set_yticks([])
ax.legend(loc='lower right');

# Share (in percent, 2 decimals) of "Currently not employed" answers in Q5,
# computed separately for respondents without and with a degree.
_share_no_degree = Kaggle_NDegree.Q5.value_counts(normalize=True)
_share_with_degree = Kaggle_WDegree.Q5.value_counts(normalize=True)
Unemployed = round(_share_no_degree.loc["Currently not employed"] * 100, 2)
# Despite its name, `employed` is the not-employed share of degree holders.
employed = round(_share_with_degree.loc["Currently not employed"] * 100, 2)
[{"Unemployed without Degree": Unemployed, "Unemployed with Degree": employed}]

# Not-employed share (percent) per degree label for the 2020, 2019 and 2018
# surveys; each result is a Series with an extra "year" entry.

# Education answers that count as holding a formal degree.
_DEGREE_LEVELS = ["Doctoral degree", "Master’s degree", "Bachelor’s degree", "Professional degree"]


def _degree_flag(answers):
    """Collapse education answers into 'With Degree' / 'Without Degree'.

    Missing answers and every answer outside ``_DEGREE_LEVELS`` become
    'Without Degree'. An already collapsed 'With Degree' value is kept, so
    re-running the cell does not flip labels.
    """
    holds_degree = answers.isin(_DEGREE_LEVELS + ["With Degree"])
    return holds_degree.map({True: "With Degree", False: "Without Degree"})


## 2020 DataSet
K20 = round(D.groupby("Q4")["Q5"].value_counts(normalize=True).loc[[('With Degree', 'Currently not employed'), ('Without Degree', 'Currently not employed')]] * 100, 2)
K20["year"] = "2020"
# Align the 2020 label with the "Not employed" label of the older surveys.
# A Series has only an index, so it is renamed through `index=`; passing
# `axis=1` is rejected by newer pandas releases.
K20 = K20.rename(index={"Currently not employed": "Not employed"})

## 2019 DataSet
# Whole-column assignment instead of `Kaggle19.Q4[mask] = ...`: chained
# assignment is not guaranteed to write back into the frame.
Kaggle19["Q4"] = _degree_flag(Kaggle19["Q4"])
K19 = round(Kaggle19.groupby("Q4")["Q5"].value_counts(normalize=True).loc[[('With Degree', 'Not employed'), ('Without Degree', 'Not employed')]] * 100, 2)
K19["year"] = "2019"

## 2018 DataSet
Kaggle18["Q4"] = _degree_flag(Kaggle18["Q4"])
# The 2018 frame is grouped on Q6 rather than Q5 (presumably the job-title
# question of that year's survey -- verify against the 2018 schema).
K18 = round(Kaggle18.groupby("Q4")["Q6"].value_counts(normalize=True).loc[[('With Degree', 'Not employed'), ('Without Degree', 'Not employed')]] * 100, 2)
K18["year"] = "2018"

# Stack the three yearly results into one frame (one row per year), flatten
# the tuple column labels to "level0|level1" strings, and reshape to long
# form for a grouped bar chart.
HistData = pd.DataFrame([K20, K19, K18]).reset_index(drop=True)
HistData.columns = ['|'.join([str(part) for part in column]) for column in HistData.columns]
HistData.rename(
    columns={
        "With Degree|Not employed": "With degree",
        "Without Degree|Not employed": "Without Degree",
        "year|": "Year",
    },
    inplace=True,
)
HistData = HistData.melt(id_vars=['Year'], var_name='Education', value_name='Percent')

fig, ax = plt.subplots(1, 1, figsize=(15, 5))
ax1 = sns.barplot(data=HistData, x='Year', y='Percent', hue='Education', palette="viridis")
ax1.set_title('Unemployement rate', fontsize=16, fontweight='bold')
ax1.set_xlabel("Year")
ax1.set_ylabel("Unempoyment Rate")
# Show the y ticks as whole percentages.
ax.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.0f%%'));

# Bar chart of Q24 answers (pay ranges) among respondents without a degree.
fig, ax = plt.subplots(1, 1, figsize=(15, 5))
XP = Kaggle_NDegree.Q24.value_counts().sort_values(ascending=False).to_frame()
ax = sns.barplot(data=XP, x=XP.index, y='Q24', palette="viridis")
ax.set_title('Pay Gap of Participants without Degree', fontsize=16, fontweight='bold')
ax.set_xlabel('Pay Range')
ax.set_ylabel('Number of Particpents')
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
# Write each bar's count just above its top edge.
for bar in ax.patches:
    bar_top = bar.get_height()
    ax.annotate(f"{bar_top:1.0f}",
                (bar.get_x() + bar.get_width() / 2., bar_top),
                ha='center', va='center',
                xytext=(0, 9), textcoords='offset points')
for side in ('top', 'left', 'right', 'bottom'):
    ax.spines[side].set_visible(False)

# Mirror bar chart of Q24 (pay range) shares: without a degree upwards,
# with a degree downwards (negated shares).
fig, ax = plt.subplots(1, 1, figsize=(15, 10))
XP = Kaggle_NDegree.Q24.value_counts(normalize=True).sort_index().to_frame()
XP1 = Kaggle_WDegree.Q24.value_counts(normalize=True).sort_index().to_frame()
ax = sns.barplot(data=XP, y='Q24', x=XP.index, color="#006699", label='Without Degree')
ax = sns.barplot(data=-XP1, y='Q24', x=XP1.index, color="#ff3333", label='With Degree')
ax.set_ylabel('')
ax.set_xlabel('')
ax.set_title('Participants Pay Gap with and without degree', fontsize=16, fontweight='bold')
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
for side in ('top', 'left', 'right', 'bottom'):
    ax.spines[side].set_visible(False)
# Annotate every bar with the absolute share as a percentage.
for bar in ax.patches:
    share = bar.get_height()
    ax.annotate(f"{abs(100 * share):.1f}%",
                (bar.get_x() + bar.get_width() / 2., share),
                ha='center', va='center',
                xytext=(0, 6), textcoords='offset points')
ax.set_yticks([])
ax.legend(loc='lower right');

import pycountry

# Respondents without a degree, counted per country answer (Q3).
Map = Kaggle_NDegree.Q3.value_counts().to_frame()


def alpha3code(column):
    """Return a 3-letter country code for every name in *column*.

    Each name is resolved with pycountry's fuzzy search and the first hit's
    ``alpha_3`` code is used. The literal name 'Other' maps to 'Other', and
    a name the search cannot resolve maps to the string 'None'.
    """
    CODE = []
    for country in column:
        if country == 'Other':
            CODE.append('Other')
            continue
        try:
            CODE.append(pycountry.countries.search_fuzzy(country)[0].alpha_3)
        except LookupError:
            # search_fuzzy signals "no match" with LookupError; only that
            # case is mapped to 'None', so unrelated failures (and
            # KeyboardInterrupt) are no longer silently swallowed.
            CODE.append('None')
    return CODE


# create a column for code
Map['CODE'] = alpha3code(Map.index)
Map.head()

import geopandas
from geopandas import GeoDataFrame

# Country shapes bundled with geopandas.
world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))
# Rename the columns positionally so the 3-letter code column is called
# 'CODE', matching the column added to Map.
world.columns = ['pop_est', 'continent', 'name', 'CODE', 'gdp_md_est', 'geometry']
# Right join: every country shape is kept, with Q3 left missing where the
# survey has no matching code.
merge = Map.merge(world, how='right', on='CODE')
# merge['Q3'] = merge['Q3'].fillna(0)
merge = GeoDataFrame(merge)
merge = merge.sort_values(by='Q3', ascending=False)

# Per-country latitude/longitude table, joined onto the map data by country
# name; these coordinates are used to place the text labels on the map.
_location_url = 'https://raw.githubusercontent.com/melanieshi0120/COVID-19_global_time_series_panel_data/master/data/countries_latitude_longitude.csv'
location = pd.read_csv(_location_url)
merge = merge.merge(location, on='name')
merge = merge.reset_index()
merge.head()

# Overwrite Egypt's label position with fixed coordinates.
# The row is selected by country name. The earlier value-based `replace`
# rewrote every row that shared Egypt's latitude (or the hard-coded
# longitude -78.183406), and it raised IndexError when Egypt was absent.
# NOTE(review): this assumes the location table's Egypt entry is the one
# carrying the wrong coordinates -- confirm against the CSV.
_is_egypt = merge["name"] == "Egypt"
merge.loc[_is_egypt, 'latitude'] = 26.8357675
merge.loc[_is_egypt, 'longitude'] = 30.7956597

# Choropleth of respondents without a degree; countries with no count are
# drawn hatched in light grey.
_missing_style = {'color': 'lightgrey', "hatch": "///", "label": "Missing values"}
merge.plot(
    column='Q3',
    scheme="quantiles",
    figsize=(25, 20),
    cmap='viridis',
    legend=True,
    missing_kwds=_missing_style,
)
plt.title('2020 Participantas with No Formal Degree', fontsize=25)
# add countries names and numbers for the first 20 rows
for row in range(20):
    caption = f"{merge.name[row]}\n{int(merge.Q3[row])}"
    plt.text(float(merge.longitude[row]), float(merge.latitude[row]), caption, size=10)

# Same map pipeline as for the no-degree respondents, applied to the
# respondents WITH a degree.
Map = Kaggle_WDegree.Q3.value_counts().to_frame()


def alpha3code(column):
    """Return a 3-letter country code for every name in *column*.

    Each name is resolved with pycountry's fuzzy search and the first hit's
    ``alpha_3`` code is used. The literal name 'Other' maps to 'Other', and
    a name the search cannot resolve maps to the string 'None'.
    """
    CODE = []
    for country in column:
        if country == 'Other':
            CODE.append('Other')
            continue
        try:
            CODE.append(pycountry.countries.search_fuzzy(country)[0].alpha_3)
        except LookupError:
            # search_fuzzy signals "no match" with LookupError; only that
            # case is mapped to 'None' instead of swallowing every error.
            CODE.append('None')
    return CODE


# create a column for code
Map['CODE'] = alpha3code(Map.index)
Map.head()

world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))
# rename the columns so that we can merge with our data
world.columns = ['pop_est', 'continent', 'name', 'CODE', 'gdp_md_est', 'geometry']
# then merge with our data
merge = pd.merge(Map, world, how='right', on='CODE')
# merge['Q3'] = merge['Q3'].fillna(0)
merge = GeoDataFrame(merge).sort_values(by='Q3', ascending=False)

location = pd.read_csv('https://raw.githubusercontent.com/melanieshi0120/COVID-19_global_time_series_panel_data/master/data/countries_latitude_longitude.csv')
merge = merge.merge(location, on='name').reset_index()

# Apply the same Egypt label-position override as the no-degree map, which
# this pipeline previously skipped.
# NOTE(review): this assumes the location table's Egypt entry carries wrong
# coordinates -- confirm against the CSV.
_is_egypt = merge["name"] == "Egypt"
merge.loc[_is_egypt, 'latitude'] = 26.8357675
merge.loc[_is_egypt, 'longitude'] = 30.7956597

merge.plot(column='Q3', scheme="quantiles",
           figsize=(25, 20),
           cmap='viridis', legend=True,
           missing_kwds={'color': 'lightgrey', "hatch": "///", "label": "Missing values"})
plt.title('2020 Participantas with Degree', fontsize=25)
# add countries names and numbers for the first 20 rows
for i in range(0, 20):
    plt.text(float(merge.longitude[i]), float(merge.latitude[i]),
             "{}\n{}".format(merge.name[i], int(merge.Q3[i])), size=10)

# Per-country distribution of Q24 answers (percent) for respondents without
# a degree, narrowed to five selected countries.
_pay_share = Kaggle_NDegree.groupby("Q3")["Q24"].value_counts(normalize=True)
Top10 = round(_pay_share * 100, 2)
Top10 = Top10.to_frame().rename(columns={"Q24": "Percentage"})
Top10 = Top10.reset_index().sort_values(by="Q3", ascending=False)
_selected_countries = ["India", "United States of America", "Japan", "Russia", "Brazil"]
Top5 = Top10[Top10.Q3.isin(_selected_countries)]

fig, ax = plt.subplots(1, 1, figsize=(15, 10))
ax1 = sns.barplot(data=Top5, x='Q24', y='Percentage', hue='Q3', palette="viridis")
ax1.set_title('Pay Gap vs Top Five COuntries without Degree Normalized', fontsize=16, fontweight='bold')
ax1.set(ylabel="Percentage")
ax1.set_xlabel('Pay Gap')
plt.xticks(rotation=90)
ax1.legend(title="")
# Show the y ticks as whole percentages.
ax.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.0f%%'));

# Keep only five selected Q24 ranges of the previous Top5 frame and redraw
# the grouped bar chart.
_selected_ranges = ["$0-999", "1,000-1,999", "10,000-14,999", "15,000-19,999", "25,000-29,999"]
Top5 = Top5[Top5.Q24.isin(_selected_ranges)]
Top5 = Top5.sort_values(by="Q24")

fig, ax = plt.subplots(1, 1, figsize=(15, 10))
ax1 = sns.barplot(data=Top5, x='Q24', y='Percentage', hue='Q3', palette="viridis")
ax1.set_title('Pay Gap vs Top Five Countries without Degree Normalized', fontsize=16, fontweight='bold')
ax1.set(ylabel="Percentage")
ax1.set_xlabel('Pay Gap')
ax1.legend(title="")
# Show the y ticks as whole percentages.
ax.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.0f%%'));

# Per-country distribution of Q24 answers (percent) for respondents with a
# degree, narrowed to five selected countries and five selected ranges.
_pay_share = Kaggle_WDegree.groupby("Q3")["Q24"].value_counts(normalize=True)
Top10 = round(_pay_share * 100, 2)
Top10 = Top10.to_frame().rename(columns={"Q24": "Percentage"})
Top10 = Top10.reset_index().sort_values(by="Q3", ascending=False)
_selected_countries = ["India", "United States of America", "Japan", "Russia", "Brazil"]
_selected_ranges = ["$0-999", "1,000-1,999", "10,000-14,999", "15,000-19,999", "25,000-29,999"]
Top5 = Top10[Top10.Q3.isin(_selected_countries)]
Top5 = Top5[Top5.Q24.isin(_selected_ranges)].sort_values(by="Q24")

fig, ax = plt.subplots(1, 1, figsize=(15, 10))
ax1 = sns.barplot(data=Top5, x='Q24', y='Percentage', hue='Q3', palette="viridis")
ax1.set_title('Pay Gap vs Top Five Countries With Degree Normalized', fontsize=16, fontweight='bold')
ax1.set(ylabel="Percentage")
ax1.set_xlabel('Pay Gap')
ax1.set_ylim([0, 50])
ax1.legend(title="")
# Show the y ticks as whole percentages.
ax.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.0f%%'));

# Respondents whose Q3 answer is "India": distribution of Q24 answers
# (percent) per degree label, limited to five selected ranges.
India = D[D.Q3 == "India"]
_india_share = India.groupby("Q4")["Q24"].value_counts(normalize=True)
India_Edu = round(_india_share * 100, 2)
India_Edu = India_Edu.to_frame().rename(columns={"Q24": "Percentage"})
India_Edu = India_Edu.reset_index().sort_values(by="Q4", ascending=False)
_selected_ranges = ["$0-999", "1,000-1,999", "10,000-14,999", "15,000-19,999", "25,000-29,999"]
India_Edu = India_Edu[India_Edu.Q24.isin(_selected_ranges)].sort_values(by="Q24")

fig, ax = plt.subplots(1, 1, figsize=(15, 8))
ax1 = sns.barplot(data=India_Edu, x='Q24', y='Percentage', hue='Q4', palette="viridis")
ax1.set_title('India Pay Gap/Without Degree Normalized', fontsize=16)
ax1.set(ylabel="Percentage")
ax1.set_xlabel('Pay Gap')
ax1.set_ylim([0, 50])
ax1.legend(title="")
# Show the y ticks as whole percentages.
ax.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.0f%%'));

# For the India subset: share of each role (Q5, percent) within every
# degree-label / Q24-range group, limited to five selected ranges.
_role_share = India.groupby(["Q4", "Q24"])["Q5"].value_counts(normalize=True)
India_Job = round(_role_share * 100, 2).to_frame()
India_Job = India_Job.rename(columns={"Q5": "Percentage"}).reset_index()
India_Job = India_Job.sort_values(by="Q24", ascending=True)
_selected_ranges = ["$0-999", "1,000-1,999", "10,000-14,999", "15,000-19,999", "25,000-29,999"]
India_Job = India_Job[India_Job.Q24.isin(_selected_ranges)].sort_values(by="Q24")

# One bar-chart panel per degree label.
ax = sns.catplot(x="Q24", y="Percentage", hue="Q5", col="Q4", data=India_Job, kind="bar")
# ax.set_title('India Pay Gap and Jobs With/Without Degree Normalized',fontsize=21, fontweight='bold')
ax.set(xlabel='Pay Gap')
ax.set_xticklabels(rotation=90);

# Scatter view of the same India data: colour encodes the role (Q5) and
# marker size encodes the degree label (Q4).
axes = sns.relplot(
    data=India_Job,
    x="Q24",
    y="Percentage",
    hue="Q5",
    size="Q4",
    sizes=(100, 300),
    palette='viridis',
    alpha=0.5,
    aspect=2,
)
axes.set(
    xlabel='Pay Gap',
    title="India Pay Gap and Jobs With/Without Degree Normalized",
    ylim=(0, 45),
)
axes.set_xticklabels(rotation=90);

# Narrow the India role shares to the "$0-999" range and plot one
# bar-chart panel per degree label.
India_Job = India_Job[India_Job.Q24 == "$0-999"]
ax23 = sns.catplot(
    data=India_Job,
    x="Q24",
    y="Percentage",
    hue="Q5",
    col="Q4",
    kind="bar",
)