ANALYSIS OF THE LAST 3 KAGGLE SURVEYS [PART 2]

Kaggle Machine Learning & Data Science Survey The most comprehensive dataset available on the state of ML and data science

import pandas as pd import numpy as np import seaborn as sns import matplotlib.pyplot as plt import warnings warnings.filterwarnings( 'ignore' ) sns.set(style='darkgrid') plt.rcParams["figure.figsize"] = (15,6)

def get_label(g): for p in g.patches: height = p.get_height() g.text(p.get_x()+p.get_width()/2., height/2, '{}'.format(round(height)), ha="center", color='white')

df_2019 = pd.read_csv("/work/multiple_choice_responses.csv") df_2020 = pd.read_csv("/work/kaggle_survey_2020_responses.csv") df_2021 = pd.read_csv("/work/kaggle_survey_2021_responses.csv")

# Select just the column that i will use df_2019 = df_2019[['Time from Start to Finish (seconds)', 'Q1', 'Q2', 'Q3', 'Q4', 'Q5']] df_2020 = df_2020[['Time from Start to Finish (seconds)', 'Q1', 'Q2', 'Q3', 'Q4', 'Q5']] df_2021 = df_2021[['Time from Start to Finish (seconds)', 'Q1', 'Q2', 'Q3', 'Q4', 'Q5']] # rename columns cols = {'Time from Start to Finish (seconds)':'Time_start_to_finish', 'Q1':'age', 'Q2':'gender', 'Q3':'country_reside', 'Q4':'formal_education', 'Q5':'title_job', 'Q6':'years_writing_code', } df_2019.rename(columns=cols, inplace=True) cols = {'Time from Start to Finish (seconds)':'Time_start_to_finish', 'Q1':'age', 'Q2':'gender', 'Q3':'country_reside', 'Q4':'formal_education', 'Q5':'title_job', 'Q6':'years_writing_code', } df_2020.rename(columns=cols, inplace=True) cols = {'Time from Start to Finish (seconds)':'Time_start_to_finish', 'Q1':'age', 'Q2':'gender', 'Q3':'country_reside', 'Q4':'formal_education', 'Q5':'title_job', 'Q6':'years_writing_code', } df_2021.rename(columns=cols, inplace=True) df_2019.drop(0, axis=0, inplace=True) df_2020.drop(0, axis=0, inplace=True) df_2021.drop(0, axis=0, inplace=True) # creare a new column to indentify what year the dataset belongs df_2019['year'] = '2019' df_2020['year'] = '2020' df_2021['year'] = '2021' aux = pd.concat([df_2019, df_2020], ignore_index=True) df_all = pd.concat([aux, df_2021], ignore_index=True) #Change the extense name df_all['country_reside'] = df_all['country_reside'].str.replace('Iran, Islamic Republic of...', 'Iran') df_all['country_reside'] = df_all['country_reside'].str.replace('United Kingdom of Great Britain and Northern Ireland', 'United Kingdom') df_all['country_reside'] = df_all['country_reside'].replace('Hong Kong (S.A.R.)', 'Hong Kong') # replace the gender df_all['gender'] = df_all['gender'].str.replace('Female', 'Woman') df_all['gender'] = df_all['gender'].str.replace('Male','Man') # Creare a new column continent map_continent = {'India':'Asia', 'Indonesia':'Asia', 'Pakistan':'Asia', 'Mexico':'North America', 'Russia':'Asia', 'Turkey':'Asia', 'Australia':'Australia', 'Nigeria':'Africa', 'Greece':'Europe', 'Belgium':'Europe', 'Japan':'Asia', 'Egypt':'Africa', 'Singapore':'Asia', 'Brazil':'South America', 'Poland':'Europe', 'China':'Asia','Iran':'Asia', 'United States of America':'North America', 'Italy':'Europe', 'Viet Nam':'Asia', 'Israel':'Asia', 'Peru':'South America', 'South Africa':'Africa', 'Other':'Other','Spain':'Europe', 'Bangladesh':'Asia', 'United Kingdom':'Europe', 'France':'Europe','Switzerland':'Europe', 'Algeria':'Africa', 'Tunisia':'Africa', 'Argentina':'South America', 'Sweden':'Europe', 'Colombia':'South America','I do not wish to disclose my location':'I do not wish to disclose my location', 'Canada':'North America','Chile':'South America', 'Netherlands':'Europe', 'Ukraine':'Europe', 'Saudi Arabia':'Asia', 'Romania':'Europe','Morocco':'Africa', 'Austria':'Europe', 'Taiwan':'Asia', 'Kenya':'Africa', 'Belarus':'Europe', 'Ireland':'Europe','Portugal':'Europe', 'Hong Kong':'Asia', 'Denmark':'Europe', 'Germany':'Europe','South Korea':'Asia', 'Philippines':'Asia', 'Sri Lanka':'Asia', 'United Arab Emirates':'Asia','Uganda':'Africa', 'Ghana':'Africa', 'Malaysia':'Asia', 'Thailand':'Asia', 'Nepal':'Asia', 'Kazakhstan':'Asia','Ethiopia':'Africa', 'Iraq':'Asia', 'Ecuador':'South America', 'Norway':'Europe', 'Czech Republic':'Europe', 'Kenya':'Africa', 'Republic of Korea':'Asia', 'New Zealand':'Oceanian', 'Hungary':'Europe'} df_all['continent'] = df_all['country_reside'].map(map_continent) # change types and trasnforming time ( seconds ) to minutes df_all['year'] = df_all['year'].astype(int) df_all['Time_start_to_finish'] = df_all['Time_start_to_finish'].astype(int) df_all['Time_start_to_finish'] = df_all['Time_start_to_finish'].apply(lambda x: round(x / 60, ndigits=2))

df_all.sample()

print(f'minimum time to complete the survey: {df_all["Time_start_to_finish"].min()} minutes') print(f'maximum time to complete the survey: {df_all["Time_start_to_finish"].max()} minutes')

df_year = df_all.groupby(['year'], as_index=False)['age'].count().rename( columns = {"age":'total'}) df_year = pd.DataFrame(df_year)

plt.figure(figsize=(15, 7)) g = sns.barplot(x='year', y='total', data = df_year, palette='Set1') plt.title('NUMBER OF SURVEY PARTICIPANTS PER YEAR') get_label(g)

df_age = df_all.groupby(['age', 'year'], as_index=False)['gender'].count().rename( columns = {"gender":'total'}) df_age = pd.DataFrame(df_age)

plt.figure(figsize=(15, 7)) g = sns.barplot(x='age', y='total', hue='year', data = df_age, palette='Set2') plt.title('AGE OF PARTICIPANTS PER YEAR');

mask = (df_all['gender'] != 'Prefer not to say') df_gender = df_all.loc[mask].groupby(['gender', 'year'], as_index=False)['age'].count().rename( columns = {'age':'total'})

plt.figure(figsize=(15, 7)) order = ['Man', 'Woman', 'Nonbinary', 'Prefer to self-describe'] g = sns.barplot(x='gender', y='total', hue='year', data = df_gender, order=order, palette='Set2') plt.title('GENDER OF PARTICIPANTS PER YEAR');

title_job = df_all.groupby(['title_job', 'year'], as_index=False)['age'].count().rename( columns = {'age':'total'})

plt.figure(figsize=(15, 7)) g = sns.barplot(x='title_job', y='total', hue='year', data = title_job, palette='Set2') plt.title('TITLE JOB OF PARTICIPANTS PER YEAR'); plt.xticks(rotation=90);

mask_country = (df_all['country_reside'] != 'Other') df_country = df_all.loc[mask_country].groupby(['country_reside', 'year'], as_index=False)['gender'].count().rename( columns = {"gender":'total'}) df_country = pd.DataFrame(df_country) df_country_10 = df_country.nlargest(30, 'total')

plt.figure(figsize=(15, 7)) g = sns.barplot(x='country_reside', y='total', hue='year', data = df_country_10, palette='magma') plt.title('COUNTRY OF PARTICIPANTS PER YEAR'); plt.xticks(rotation=90);

mask = (df_all['continent'] != 'Other') & (df_all['continent'] != 'I do not wish to disclose my location') df1_contnent = df_all.loc[mask].groupby(['continent', 'year'], as_index=False)['age'].count().rename( columns = {'age':'total'})

order = ['Asia', 'North America', 'Europe', 'Africa', 'South America', 'Oceanian'] g = sns.barplot(y='continent', x='total', hue='year', data = df1_contnent, order=order, palette='magma') plt.title('CONTINENT OF PARTICIPANTS PER YEAR');

.css-15w88e5{color:var(--chakra-colors-fg-neutral-primary);font-weight:inherit;letter-spacing:-0.09px;}Kaggle Machine Learning &amp; Data Science Survey The most comprehensive dataset available on the state of ML and data science

Kaggle Machine Learning & Data Science Survey The most comprehensive dataset available on the state of ML and data science