Kaggle Machine Learning & Data Science Survey The most comprehensive dataset available on the state of ML and data science
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings( 'ignore' )
sns.set(style='darkgrid')
plt.rcParams["figure.figsize"] = (15,6)
def get_label(g):
for p in g.patches:
height = p.get_height()
g.text(p.get_x()+p.get_width()/2.,
height/2,
'{}'.format(round(height)),
ha="center", color='white')
df_2019 = pd.read_csv("/work/multiple_choice_responses.csv")
df_2020 = pd.read_csv("/work/kaggle_survey_2020_responses.csv")
df_2021 = pd.read_csv("/work/kaggle_survey_2021_responses.csv")
# Select just the column that i will use
df_2019 = df_2019[['Time from Start to Finish (seconds)', 'Q1', 'Q2', 'Q3', 'Q4', 'Q5']]
df_2020 = df_2020[['Time from Start to Finish (seconds)', 'Q1', 'Q2', 'Q3', 'Q4', 'Q5']]
df_2021 = df_2021[['Time from Start to Finish (seconds)', 'Q1', 'Q2', 'Q3', 'Q4', 'Q5']]
# rename columns
cols = {'Time from Start to Finish (seconds)':'Time_start_to_finish', 'Q1':'age', 'Q2':'gender', 'Q3':'country_reside',
'Q4':'formal_education', 'Q5':'title_job', 'Q6':'years_writing_code', }
df_2019.rename(columns=cols, inplace=True)
cols = {'Time from Start to Finish (seconds)':'Time_start_to_finish', 'Q1':'age', 'Q2':'gender', 'Q3':'country_reside',
'Q4':'formal_education', 'Q5':'title_job', 'Q6':'years_writing_code', }
df_2020.rename(columns=cols, inplace=True)
cols = {'Time from Start to Finish (seconds)':'Time_start_to_finish', 'Q1':'age', 'Q2':'gender', 'Q3':'country_reside',
'Q4':'formal_education', 'Q5':'title_job', 'Q6':'years_writing_code', }
df_2021.rename(columns=cols, inplace=True)
df_2019.drop(0, axis=0, inplace=True)
df_2020.drop(0, axis=0, inplace=True)
df_2021.drop(0, axis=0, inplace=True)
# creare a new column to indentify what year the dataset belongs
df_2019['year'] = '2019'
df_2020['year'] = '2020'
df_2021['year'] = '2021'
aux = pd.concat([df_2019, df_2020], ignore_index=True)
df_all = pd.concat([aux, df_2021], ignore_index=True)
#Change the extense name
df_all['country_reside'] = df_all['country_reside'].str.replace('Iran, Islamic Republic of...', 'Iran')
df_all['country_reside'] = df_all['country_reside'].str.replace('United Kingdom of Great Britain and Northern Ireland', 'United Kingdom')
df_all['country_reside'] = df_all['country_reside'].replace('Hong Kong (S.A.R.)', 'Hong Kong')
# replace the gender
df_all['gender'] = df_all['gender'].str.replace('Female', 'Woman')
df_all['gender'] = df_all['gender'].str.replace('Male','Man')
# Creare a new column continent
map_continent = {'India':'Asia', 'Indonesia':'Asia', 'Pakistan':'Asia', 'Mexico':'North America', 'Russia':'Asia', 'Turkey':'Asia',
'Australia':'Australia', 'Nigeria':'Africa', 'Greece':'Europe', 'Belgium':'Europe', 'Japan':'Asia', 'Egypt':'Africa',
'Singapore':'Asia', 'Brazil':'South America', 'Poland':'Europe', 'China':'Asia','Iran':'Asia', 'United States of America':'North America',
'Italy':'Europe', 'Viet Nam':'Asia', 'Israel':'Asia', 'Peru':'South America', 'South Africa':'Africa', 'Other':'Other','Spain':'Europe', 'Bangladesh':'Asia',
'United Kingdom':'Europe', 'France':'Europe','Switzerland':'Europe', 'Algeria':'Africa', 'Tunisia':'Africa', 'Argentina':'South America', 'Sweden':'Europe',
'Colombia':'South America','I do not wish to disclose my location':'I do not wish to disclose my location', 'Canada':'North America','Chile':'South America',
'Netherlands':'Europe', 'Ukraine':'Europe', 'Saudi Arabia':'Asia', 'Romania':'Europe','Morocco':'Africa', 'Austria':'Europe', 'Taiwan':'Asia', 'Kenya':'Africa', 'Belarus':'Europe',
'Ireland':'Europe','Portugal':'Europe', 'Hong Kong':'Asia', 'Denmark':'Europe', 'Germany':'Europe','South Korea':'Asia', 'Philippines':'Asia', 'Sri Lanka':'Asia',
'United Arab Emirates':'Asia','Uganda':'Africa', 'Ghana':'Africa', 'Malaysia':'Asia', 'Thailand':'Asia', 'Nepal':'Asia', 'Kazakhstan':'Asia','Ethiopia':'Africa', 'Iraq':'Asia',
'Ecuador':'South America', 'Norway':'Europe', 'Czech Republic':'Europe', 'Kenya':'Africa', 'Republic of Korea':'Asia', 'New Zealand':'Oceanian', 'Hungary':'Europe'}
df_all['continent'] = df_all['country_reside'].map(map_continent)
# change types and trasnforming time ( seconds ) to minutes
df_all['year'] = df_all['year'].astype(int)
df_all['Time_start_to_finish'] = df_all['Time_start_to_finish'].astype(int)
df_all['Time_start_to_finish'] = df_all['Time_start_to_finish'].apply(lambda x: round(x / 60, ndigits=2))
df_all.sample()
print(f'minimum time to complete the survey: {df_all["Time_start_to_finish"].min()} minutes')
print(f'maximum time to complete the survey: {df_all["Time_start_to_finish"].max()} minutes')
df_year = df_all.groupby(['year'], as_index=False)['age'].count().rename( columns = {"age":'total'})
df_year = pd.DataFrame(df_year)
plt.figure(figsize=(15, 7))
g = sns.barplot(x='year', y='total', data = df_year, palette='Set1')
plt.title('NUMBER OF SURVEY PARTICIPANTS PER YEAR')
get_label(g)
df_age = df_all.groupby(['age', 'year'], as_index=False)['gender'].count().rename( columns = {"gender":'total'})
df_age = pd.DataFrame(df_age)
plt.figure(figsize=(15, 7))
g = sns.barplot(x='age', y='total', hue='year', data = df_age, palette='Set2')
plt.title('AGE OF PARTICIPANTS PER YEAR');
mask = (df_all['gender'] != 'Prefer not to say')
df_gender = df_all.loc[mask].groupby(['gender', 'year'], as_index=False)['age'].count().rename( columns = {'age':'total'})
plt.figure(figsize=(15, 7))
order = ['Man', 'Woman', 'Nonbinary', 'Prefer to self-describe']
g = sns.barplot(x='gender', y='total', hue='year', data = df_gender, order=order, palette='Set2')
plt.title('GENDER OF PARTICIPANTS PER YEAR');
title_job = df_all.groupby(['title_job', 'year'], as_index=False)['age'].count().rename( columns = {'age':'total'})
plt.figure(figsize=(15, 7))
g = sns.barplot(x='title_job', y='total', hue='year', data = title_job, palette='Set2')
plt.title('TITLE JOB OF PARTICIPANTS PER YEAR');
plt.xticks(rotation=90);
mask_country = (df_all['country_reside'] != 'Other')
df_country = df_all.loc[mask_country].groupby(['country_reside', 'year'], as_index=False)['gender'].count().rename( columns = {"gender":'total'})
df_country = pd.DataFrame(df_country)
df_country_10 = df_country.nlargest(30, 'total')
plt.figure(figsize=(15, 7))
g = sns.barplot(x='country_reside', y='total', hue='year', data = df_country_10, palette='magma')
plt.title('COUNTRY OF PARTICIPANTS PER YEAR');
plt.xticks(rotation=90);
mask = (df_all['continent'] != 'Other') & (df_all['continent'] != 'I do not wish to disclose my location')
df1_contnent = df_all.loc[mask].groupby(['continent', 'year'], as_index=False)['age'].count().rename( columns = {'age':'total'})
order = ['Asia', 'North America', 'Europe', 'Africa', 'South America', 'Oceanian']
g = sns.barplot(y='continent', x='total', hue='year', data = df1_contnent, order=order, palette='magma')
plt.title('CONTINENT OF PARTICIPANTS PER YEAR');