# Import of libraries and data
# Import of general libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Import of specific libraries used in few functions
import statsmodels.api as sm
from statsmodels.graphics.gofplots import qqplot_2samples #Used in our 2 sample qq plot
from scipy.stats import ttest_ind # For use when doing the students T-Test
from scipy.stats import mannwhitneyu #Used when checking if the train and val data come from the same distribution
from scipy.stats import pearsonr #Used in function "test_pearson_correlation"
from scipy.stats import norm #Used in function "hist_and_norm" where we take a histogram of the data and put a normal distribution curve over it, and in "get_gaussian_pdf" for a similar use.
from scipy.stats import shapiro #Used in function "test_for_normality" to check if the data is normally distributed
import pylab as py #Used when printing the QQ plot that checks if the data is normally distributed.
from sklearn.model_selection import train_test_split #Used in the function that splits the data.
from sklearn.linear_model import LinearRegression #Used as our model
from sklearn.metrics import mean_squared_error, r2_score #used in the calculation of the result of our model
import math #Used when calculating the error from our mean square error
# Import of data
df = pd.read_csv("survey_results_public.csv")
# Keep only respondents from the four Nordic countries of interest.
df_nordic = df[df['Country'].isin(['Sweden','Norway','Denmark','Finland'])]
# The majority work full time and are developers as a profession. For liability of the comparison, we only focus on the people working full time employed as developers.
df_nordic_working_fulltime = df_nordic[df_nordic["Employment"] == "Employed full-time"]
df_nordic_working_developer = df_nordic_working_fulltime[df_nordic_working_fulltime['MainBranch']=='I am a developer by profession']
# We choose the variables of interest by position (Country, EdLevel,
# YearsCodePro, ConvertedCompYearly), explained in more detail further down.
# .copy() prevents pandas' SettingWithCopyWarning on the assignments below.
comp_df = df_nordic_working_developer.iloc[:, [3,6,10,47]].copy()
# 'Less than 1 year' is coded as 0 so YearsCodePro can be made numeric.
comp_df = comp_df.replace('Less than 1 year', 0)
comp_df['YearsCodePro'] = pd.to_numeric(comp_df['YearsCodePro'])
# We take away the extreme values (keep 15000 < ConvertedCompYearly <= 150000).
# A single reset_index(drop=True) replaces the previous double reset_index()
# that created 'level_0'/'index' columns which then had to be deleted.
comp_df = comp_df.loc[comp_df['ConvertedCompYearly'] <= 150000]
final_df = comp_df.loc[comp_df['ConvertedCompYearly'] > 15000].reset_index(drop=True)
# Restrict to the two large education groups so EdLevel is a clean binary factor.
final_df = final_df[final_df['EdLevel'].isin(['Bachelor’s degree (B.A., B.S., B.Eng., etc.)', 'Master’s degree (M.A., M.S., M.Eng., MBA, etc.)'])]
final_df.dropna(inplace=True)
final_df
Countryobject
Sweden37.2%
Norway24.7%
2 others38.1%
EdLevelobject
Master’s degree (M.A., M.S., M.Eng., MBA, etc.)54.1%
Bachelor’s degree (B.A., B.S., B.Eng., etc.)45.9%
0
Sweden
Master’s degree (M.A., M.S., M.Eng., MBA, etc.)
1
Denmark
Bachelor’s degree (B.A., B.S., B.Eng., etc.)
2
Finland
Bachelor’s degree (B.A., B.S., B.Eng., etc.)
4
Sweden
Bachelor’s degree (B.A., B.S., B.Eng., etc.)
5
Finland
Master’s degree (M.A., M.S., M.Eng., MBA, etc.)
6
Sweden
Bachelor’s degree (B.A., B.S., B.Eng., etc.)
7
Norway
Bachelor’s degree (B.A., B.S., B.Eng., etc.)
9
Sweden
Master’s degree (M.A., M.S., M.Eng., MBA, etc.)
10
Sweden
Bachelor’s degree (B.A., B.S., B.Eng., etc.)
11
Finland
Bachelor’s degree (B.A., B.S., B.Eng., etc.)
# Makes dummy variables so that we can work with the categoric variables
def make_dummy(df, x_columns, y_column):
    """Return (X, Y) where X holds `x_columns` with categorical columns
    one-hot encoded and Y is the `y_column` series.

    The first level of each categorical column is dropped as the base
    category to avoid the dummy-variable trap (perfect collinearity).
    """
    X = df[x_columns]  # independent variables
    Y = df[y_column]   # dependent variable
    # (The old no-op `X.head()` call, whose result was discarded, is removed.)
    X = pd.get_dummies(data=X, drop_first=True)
    return X, Y
# Build two design matrices: a plain one (X) and one extended with
# polynomial and interaction terms (X_pol). Y is the same target both times.
X, Y = make_dummy(final_df, ['Country', 'YearsCodePro', 'EdLevel'], 'ConvertedCompYearly')
X_pol, Y = make_dummy(final_df, ['Country', 'YearsCodePro', 'EdLevel'], 'ConvertedCompYearly')
# Polynomial terms for the continuous predictor, plus an interaction between
# coding experience and holding a Master's degree.
X_pol['YearsCodePro2'] = X_pol['YearsCodePro'] ** 2
X_pol['YearsCodePro3'] = X_pol['YearsCodePro'] ** 3
X_pol['YearsCodePro_Master'] = X_pol['YearsCodePro'] * X_pol['EdLevel_Master’s degree (M.A., M.S., M.Eng., MBA, etc.)']
# Divides the data into test, train and val data for X and Y
def test_train_val_split(X, Y):
    """Carve off 10% of the data as a test set, then split the remainder
    into training and validation sets (validation is approximately 1/9 of
    the remainder, i.e. roughly 10% of the full data).

    Prints the shape of every part and returns
    (X_test, y_test, X_train, y_train, X_val, y_val).
    """
    X_rest, X_test, y_rest, y_test = train_test_split(X, Y, test_size=0.1, random_state=101)
    X_train, X_val, y_train, y_val = train_test_split(X_rest, y_rest, test_size=0.1111111, random_state=101)
    # Report every split's shape (the "y_train" label has no colon, as before).
    for label, part in (("X_test:", X_test), ("y_test:", y_test),
                        ("X_train:", X_train), ("y_train", y_train),
                        ("X_val:", X_val), ("y_val:", y_val)):
        print(label, part.shape)
    return (X_test, y_test, X_train, y_train, X_val, y_val)
# Produce the two parallel splits: plain features and polynomial features.
# The splitter uses a fixed random_state, so the row partition is identical
# for X and X_pol; Y is the same series in both calls.
X_test,y_test,X_train,y_train,X_val,y_val = test_train_val_split(X, Y)
X_test_pol,y_test_pol,X_train_pol,y_train_pol,X_val_pol,y_val_pol = test_train_val_split(X_pol, Y)
X_test: (112, 5)
y_test: (112,)
X_train: (891, 5)
y_train (891,)
X_val: (112, 5)
y_val: (112,)
X_test: (112, 8)
y_test: (112,)
X_train: (891, 8)
y_train (891,)
X_val: (112, 8)
y_val: (112,)
# Two-sample QQ plot comparing the empirical quantiles of two data sets.
def qq_plot(data1, data2, label1, label2, variable, variable_desc,
            sample_size=71, random_state=None):
    """Draw a two-sample QQ plot of `data1` against `data2`.

    `data1` is downsampled to `sample_size` points (default 71 — the value
    previously hard-coded) so the two probability plots are comparable.
    Pass `random_state` for a reproducible subsample; the default None keeps
    the original non-deterministic sampling.
    `variable` is unused here but kept for interface compatibility.
    """
    pp_1 = sm.ProbPlot(np.array(data1.sample(sample_size, random_state=random_state)))
    pp_2 = sm.ProbPlot(np.array(data2))
    # line="45" draws the y = x reference line
    qqplot_2samples(pp_1, pp_2, xlabel=label1, ylabel=label2, line="45")
    plt.title(variable_desc)
qq_plot(y_train,y_val, 'Train data', 'Validation data', 'ConvertedCompYearly', 'Converted comp/year')
# Mann-Whitney U Test, to test if two independent samples have the same distribution
# H0: Same distribution
# H1: Not the same distribution
def test_same_distribution(data1, data2):
stat, p = mannwhitneyu(data1, data2)
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
print('Probably the same distribution')
else:
print('Probably different distributions')
test_same_distribution(y_train, y_val)
stat=54243.000, p=0.133
Probably the same distribution
def make_histogram(df, df_rowname, bin_amount, graph_title, x_label, y_label):
    """Plot a histogram of column `df_rowname` of `df` with `bin_amount`
    bins and the given title and axis labels."""
    axes = df[df_rowname].plot.hist(bins=bin_amount, title=graph_title, figsize=(8, 5))
    axes.set_xlabel(x_label)
    axes.set_ylabel(y_label)
    plt.show()
# Distribution of the two numeric variables of interest.
make_histogram(final_df, 'ConvertedCompYearly',40, 'Earning for nordic people working with programing full time','Dollar earned yearly','Amount of persons in each yearly income category')
make_histogram(final_df, 'YearsCodePro',20, 'Total year a person have worked with coding','Years coded profesionally','Amount of persons in each category')
# Categorized by education level (EdLevel)
sns.pairplot(final_df,hue='EdLevel')
# Correlation matrix
# NOTE(review): DataFrame.corr() without numeric_only relies on older pandas
# silently dropping the non-numeric columns (Country, EdLevel); pandas >= 2.0
# raises here — confirm the pinned pandas version.
sns.heatmap(final_df.corr(),annot=True,lw=1)
# Pearson's Correlation test to see if there is a correlation between the dependent
# and independent variables
def test_pearson_correlation(data1, data2):
stat, p = pearsonr(data1, data2)
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
print('Probably independent')
else:
print('Probably dependent')
# Correlation between coding experience and yearly compensation.
print("YearsCodePro VS ConvertedCompYearly:")
data1 = final_df['ConvertedCompYearly']
data2 = final_df['YearsCodePro']
test_pearson_correlation(data1, data2)
YearsCodePro VS ConvertedCompYearly:
stat=0.364, p=0.000
Probably dependent
def describe_data(df):
    """Print df.describe() extended with a 'range' row (max - min).

    Returns the extended description frame so callers can inspect it
    programmatically (previously returned None; printing is unchanged,
    so existing callers are unaffected).
    """
    df_description = df.describe()
    # range = max - min, appended as an extra summary row
    df_description.loc["range"] = df_description.loc['max'] - df_description.loc['min']
    print(df_description)
    return df_description
describe_data(final_df)
YearsCodePro ConvertedCompYearly
count 1115.000000 1115.000000
mean 9.200000 70906.772197
std 7.360821 21771.479406
min 0.000000 25944.000000
25% 4.000000 55224.000000
50% 7.000000 67500.000000
75% 13.000000 82962.000000
max 40.000000 149364.000000
range 40.000000 123420.000000
def make_bar(df, x_rowname, y_rowname, graph_title, x_label, y_label):
    """Draw a bar chart of `y_rowname` against `x_rowname`, with the legend
    removed and horizontal, slightly smaller x tick labels."""
    axes = df.plot.bar(x=x_rowname, y=y_rowname, title=graph_title, figsize=(7, 5))
    axes.set_xlabel(x_label)
    axes.set_ylabel(y_label)
    axes.get_legend().remove()
    # Shrink the tick labels and keep them horizontal so category names fit.
    plt.rc('xtick', labelsize=8)
    plt.xticks(rotation=0)
    plt.show()
def group_and_barchart_income(df, grouping_column, graph_title, x_label, y_label):
    """Group `df` by `grouping_column`, print each group's size and mean
    yearly income, and draw the means as a bar chart."""
    grouped_df = (
        df.groupby(grouping_column)
          .agg(count=(grouping_column, 'size'),
               mean_yearly_income=('ConvertedCompYearly', 'mean'))
          .reset_index()
    )
    print(grouped_df)
    make_bar(grouped_df, grouping_column, 'mean_yearly_income', graph_title, x_label, y_label)
def group_and_barchart_code(df, grouping_column, graph_title, x_label, y_label):
    """Group `df` by `grouping_column`, print each group's size and mean
    years of professional coding, and draw the means as a bar chart."""
    grouped_df = (
        df.groupby(grouping_column)
          .agg(count=(grouping_column, 'size'),
               mean_coding=('YearsCodePro', 'mean'))
          .reset_index()
    )
    print(grouped_df)
    make_bar(grouped_df, grouping_column, 'mean_coding', graph_title, x_label, y_label)
# Mean income by education level and country, and mean experience by country.
group_and_barchart_income(final_df, 'EdLevel', 'Yearly income compared to Education level','Education group','Dollar earned yearly')
group_and_barchart_income(final_df, 'Country', 'Yearly income compared to country','Country','Dollar earned yearly')
group_and_barchart_code(final_df, 'Country', 'Coding experience compared to country','Country','Years coding professinally')
EdLevel count mean_yearly_income
0 Bachelor’s degree (B.A., B.S., B.Eng., etc.) 512 66478.794922
1 Master’s degree (M.A., M.S., M.Eng., MBA, etc.) 603 74666.514096
Country count mean_yearly_income
0 Denmark 208 85443.062500
1 Finland 217 61708.368664
2 Norway 275 82421.323636
3 Sweden 415 60800.756627
Country count mean_coding
0 Denmark 208 8.298077
1 Finland 217 9.976959
2 Norway 275 8.709091
3 Sweden 415 9.571084
def hist_and_norm(data, title):
    """Plot a density histogram of `data` overlaid with the PDF of the
    normal distribution fitted to it."""
    fitted_mean, fitted_std = norm.fit(data)
    # Density histogram of the raw data.
    plt.hist(data, bins=25, density=True, alpha=0.6)
    # Evaluate the fitted normal PDF across the visible x range and overlay it.
    x_lo, x_hi = plt.xlim()
    grid = np.linspace(x_lo, x_hi, 100)
    plt.plot(grid, norm.pdf(grid, fitted_mean, fitted_std), 'k', linewidth=2)
    plt.title(title)
    plt.show()
# Visual normality check: histogram with fitted normal curve per variable.
hist_and_norm(final_df['ConvertedCompYearly'],'The distribution of yearly converted income, and the normal distribution curve')
hist_and_norm(final_df['YearsCodePro'],'The distribution of years coding professionally, and the normal distribution curve')
# Here the test is done for the normal distribution
def QQ_plot_normal(data, title):
    """QQ plot of standardized `data` against the standard normal; points
    close to the 45-degree line indicate approximate normality."""
    standardized = (data - data.mean()) / data.std()
    sm.qqplot(standardized, line='45')
    plt.title(title)
    py.show()
# QQ plots of the standardized variables against the standard normal.
QQ_plot_normal(final_df['ConvertedCompYearly'], "ConvertedCompYearly" )
QQ_plot_normal(final_df['YearsCodePro'], 'YearsCodePro')
# Example of the Shapiro-Wilk Normality Test
def test_for_normality(data):
stat, p = shapiro(data)
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
print('Probably Gaussian')
else:
print('Probably not Gaussian')
# Formal normality tests for the two numeric variables.
test_for_normality(final_df['ConvertedCompYearly'])
test_for_normality(final_df['YearsCodePro'])
stat=0.963, p=0.000
Probably not Gaussian
stat=0.897, p=0.000
Probably not Gaussian
# Means and standard deviations of the numeric columns.
# numeric_only=True makes the old implicit behavior explicit: older pandas
# silently dropped the non-numeric columns (Country, EdLevel), while
# pandas >= 2.0 raises without it. The printed output is unchanged.
print("Means:")
print(final_df.mean(numeric_only=True))
print("Standard deviations:")
print(final_df.std(numeric_only=True))
Means:
YearsCodePro 9.200000
ConvertedCompYearly 70906.772197
dtype: float64
Standard deviations:
YearsCodePro 7.360821
ConvertedCompYearly 21771.479406
dtype: float64
def get_gaussian_pdf(data, sample_size=1000):
    """Fit a normal distribution to `data` (sample mean and std) and
    evaluate its PDF on `sample_size` evenly spaced points spanning the
    data range.

    Returns (pdf_values, grid, estimated_mean, estimated_std).
    """
    mean_hat = data.mean()
    std_hat = data.std()
    grid = np.linspace(data.min(), data.max(), num=sample_size)
    return norm.pdf(grid, mean_hat, std_hat), grid, mean_hat, std_hat
def plot_pdfs(data1, data2, x1_PDF, x1_hist, x2_PDF, x2_hist, title, dep_var_label):
    """Plot, on a single axis, the density histograms of two samples
    together with their fitted normal PDFs (legend labels supplied by
    the caller)."""
    fig, ax = plt.subplots(1, 1, figsize=(8, 4))
    # Fit a normal to each sample and evaluate it over the sample's range.
    pdf_1, grid_1, _, _ = get_gaussian_pdf(data1)
    pdf_2, grid_2, _, _ = get_gaussian_pdf(data2)
    ax.plot(grid_1, pdf_1, label=x1_PDF)
    ax.hist(data1, bins=100, density=True, label=x1_hist)
    ax.plot(grid_2, pdf_2, label=x2_PDF)
    ax.hist(data2, bins=100, density=True, label=x2_hist)
    ax.set_title(title)
    ax.set_xlabel(dep_var_label)
    ax.set_ylabel("Probability density")
    ax.legend()
# Get gaussian PDF and plot for Comparison of income between Masters and Bachelor level
# NOTE(review): the commented-out .sample(40, random_state=42) suggests
# subsampling was tried and abandoned; the full groups are plotted here.
data1 = np.array(final_df[final_df["EdLevel"] == "Master’s degree (M.A., M.S., M.Eng., MBA, etc.)"]["ConvertedCompYearly"])#.sample(40, random_state=42))
data2 = np.array(final_df[final_df["EdLevel"] == "Bachelor’s degree (B.A., B.S., B.Eng., etc.)"]["ConvertedCompYearly"])#.sample(40, random_state=42))
# Legend labels and axis text for plot_pdfs.
x1_PDF = "Master’s degree estimated PDF"
x1_hist = "Master’s degree histogram"
x2_PDF = "Bachelor’s degree estimated PDF"
x2_hist = "Bachelor’s degree histogram"
title = "Comparison of yearly income between Masters and Bachelor level"
dep_var_label = "Income"
plot_pdfs(data1, data2, x1_PDF, x1_hist, x2_PDF, x2_hist, title, dep_var_label)
# Student's t-test, to see if the data have the same mean
# H0: equal mean yearly income for Master's vs Bachelor's holders (alpha 0.05).
data1 = final_df[final_df["EdLevel"] == "Master’s degree (M.A., M.S., M.Eng., MBA, etc.)"]["ConvertedCompYearly"]
data2 = final_df[final_df["EdLevel"] == "Bachelor’s degree (B.A., B.S., B.Eng., etc.)"]["ConvertedCompYearly"]
stat, p = ttest_ind(data1, data2)
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
    print('Probably the same distribution')
else:
    print('Probably different distributions')
stat=6.368, p=0.000
Probably different distributions
def linear_regression(X_train, y_train, X_val, y_val, data_label, graph_title, x_axis_label, y_axis_label):
    """Fit an OLS linear regression on the training data, report its
    coefficients, plot predicted vs. true values on the evaluation data,
    and print MSE, RMSE and the full statsmodels OLS summary.

    data_label names the evaluation split in the printed output
    (e.g. "validation data" or "test data").
    """
    model = LinearRegression()
    model.fit(X_train, y_train)
    # Intercept and parameter coefficients of the fitted model
    print('Intercept for train data:', model.intercept_)
    coeff_parameter = pd.DataFrame(model.coef_, X_train.columns, columns=['Coefficient'])
    print('Parameter Coefficents for train data:', coeff_parameter)
    # Predicted vs. true values on the evaluation data.
    # (Removed the earlier duplicate positional sns.regplot(y_val, y_pred)
    # call, which drew an extra overlay and triggered a seaborn FutureWarning.)
    y_pred = model.predict(X_val)
    ax = sns.regplot(x=y_val, y=y_pred, x_bins=6)
    ax.set_title(graph_title, fontsize=15)
    ax.set_xlabel(x_axis_label, fontsize=12)
    ax.set_ylabel(y_axis_label, fontsize=12)
    print("Results for the", data_label)
    # Mean squared error, computed once, and its root (same units as y).
    mse = mean_squared_error(y_val, y_pred)
    print("Mean squared error: %.2f" % mse)
    RMSE = math.sqrt(mse)
    print("RMSE", RMSE)
    # R-squared and coefficient p-values via statsmodels OLS
    X_train_Sm = sm.add_constant(X_train)
    ls = sm.OLS(y_train, X_train_Sm).fit()
    print(ls.summary())
    plt.show()
linear_regression(X_train,y_train,X_val,y_val, "validation data",'Multivariate linear regression' ,'Validation data yearly income', 'Predicted yearly income')
Intercept for train data: 72765.5109923062
Parameter Coefficents for train data: Coefficient
YearsCodePro 1172.899046
Country_Finland -24968.628845
Country_Norway -4115.391342
Country_Sweden -26058.717273
EdLevel_Master’s degree (M.A., M.S., M.Eng., MB... 5554.477576
/shared-libs/python3.9/py/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
warnings.warn(
Results for the validation data
Mean squared error: 230583620.13
RMSE 15184.980083191187
OLS Regression Results
===============================================================================
Dep. Variable: ConvertedCompYearly R-squared: 0.456
Model: OLS Adj. R-squared: 0.453
Method: Least Squares F-statistic: 148.3
Date: Wed, 05 Jan 2022 Prob (F-statistic): 2.36e-114
Time: 12:27:06 Log-Likelihood: -9877.2
No. Observations: 891 AIC: 1.977e+04
Df Residuals: 885 BIC: 1.980e+04
Df Model: 5
Covariance Type: nonrobust
===========================================================================================================================
coef std err t P>|t| [0.025 0.975]
---------------------------------------------------------------------------------------------------------------------------
const 7.277e+04 1508.039 48.252 0.000 6.98e+04 7.57e+04
YearsCodePro 1172.8990 72.221 16.240 0.000 1031.154 1314.644
Country_Finland -2.497e+04 1747.983 -14.284 0.000 -2.84e+04 -2.15e+04
Country_Norway -4115.3913 1627.491 -2.529 0.012 -7309.583 -921.199
Country_Sweden -2.606e+04 1523.692 -17.102 0.000 -2.9e+04 -2.31e+04
EdLevel_Master’s degree (M.A., M.S., M.Eng., MBA, etc.) 5554.4776 1067.664 5.202 0.000 3459.030 7649.926
==============================================================================
Omnibus: 167.314 Durbin-Watson: 2.049
Prob(Omnibus): 0.000 Jarque-Bera (JB): 424.460
Skew: 0.982 Prob(JB): 6.75e-93
Kurtosis: 5.753 Cond. No. 60.5
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
linear_regression(X_train_pol,y_train_pol,X_val_pol,y_val_pol, "validation data",'Multivariate third grade polynominal linear regression' ,'Validation yearly income', 'Predicted yearly income')
Intercept for train data: 62674.5087776754
Parameter Coefficents for train data: Coefficient
YearsCodePro 4403.424102
Country_Finland -25319.216236
Country_Norway -3587.930609
Country_Sweden -26049.438921
EdLevel_Master’s degree (M.A., M.S., M.Eng., MB... 3343.607981
YearsCodePro2 -188.673753
YearsCodePro3 2.488769
YearsCodePro_Master 170.895879
/shared-libs/python3.9/py/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
warnings.warn(
Results for the validation data
Mean squared error: 215025919.51
RMSE 14663.762119953397
OLS Regression Results
===============================================================================
Dep. Variable: ConvertedCompYearly R-squared: 0.518
Model: OLS Adj. R-squared: 0.513
Method: Least Squares F-statistic: 118.4
Date: Wed, 05 Jan 2022 Prob (F-statistic): 4.13e-134
Time: 12:33:29 Log-Likelihood: -9823.4
No. Observations: 891 AIC: 1.966e+04
Df Residuals: 882 BIC: 1.971e+04
Df Model: 8
Covariance Type: nonrobust
===========================================================================================================================
coef std err t P>|t| [0.025 0.975]
---------------------------------------------------------------------------------------------------------------------------
const 6.267e+04 1955.184 32.056 0.000 5.88e+04 6.65e+04
YearsCodePro 4403.4241 427.948 10.290 0.000 3563.510 5243.339
Country_Finland -2.532e+04 1653.031 -15.317 0.000 -2.86e+04 -2.21e+04
Country_Norway -3587.9306 1538.938 -2.331 0.020 -6608.338 -567.524
Country_Sweden -2.605e+04 1439.713 -18.093 0.000 -2.89e+04 -2.32e+04
EdLevel_Master’s degree (M.A., M.S., M.Eng., MBA, etc.) 3343.6080 1619.215 2.065 0.039 165.645 6521.571
YearsCodePro2 -188.6738 32.315 -5.839 0.000 -252.097 -125.250
YearsCodePro3 2.4888 0.662 3.762 0.000 1.190 3.787
YearsCodePro_Master 170.8959 138.463 1.234 0.217 -100.859 442.651
==============================================================================
Omnibus: 161.858 Durbin-Watson: 2.027
Prob(Omnibus): 0.000 Jarque-Bera (JB): 435.713
Skew: 0.931 Prob(JB): 2.43e-95
Kurtosis: 5.876 Cond. No. 3.72e+04
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.72e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
linear_regression(X_train,y_train,X_test,y_test, "test data",'Multivariate linear regression' ,'Test data yearly income', 'Predicted yearly income')
Intercept for train data: 72765.5109923062
Parameter Coefficents for train data: Coefficient
YearsCodePro 1172.899046
Country_Finland -24968.628845
Country_Norway -4115.391342
Country_Sweden -26058.717273
EdLevel_Master’s degree (M.A., M.S., M.Eng., MB... 5554.477576
/shared-libs/python3.9/py/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
warnings.warn(
Results for the test data
Mean squared error: 361499358.21
RMSE 19013.136464252933
OLS Regression Results
===============================================================================
Dep. Variable: ConvertedCompYearly R-squared: 0.456
Model: OLS Adj. R-squared: 0.453
Method: Least Squares F-statistic: 148.3
Date: Wed, 05 Jan 2022 Prob (F-statistic): 2.36e-114
Time: 12:36:09 Log-Likelihood: -9877.2
No. Observations: 891 AIC: 1.977e+04
Df Residuals: 885 BIC: 1.980e+04
Df Model: 5
Covariance Type: nonrobust
===========================================================================================================================
coef std err t P>|t| [0.025 0.975]
---------------------------------------------------------------------------------------------------------------------------
const 7.277e+04 1508.039 48.252 0.000 6.98e+04 7.57e+04
YearsCodePro 1172.8990 72.221 16.240 0.000 1031.154 1314.644
Country_Finland -2.497e+04 1747.983 -14.284 0.000 -2.84e+04 -2.15e+04
Country_Norway -4115.3913 1627.491 -2.529 0.012 -7309.583 -921.199
Country_Sweden -2.606e+04 1523.692 -17.102 0.000 -2.9e+04 -2.31e+04
EdLevel_Master’s degree (M.A., M.S., M.Eng., MBA, etc.) 5554.4776 1067.664 5.202 0.000 3459.030 7649.926
==============================================================================
Omnibus: 167.314 Durbin-Watson: 2.049
Prob(Omnibus): 0.000 Jarque-Bera (JB): 424.460
Skew: 0.982 Prob(JB): 6.75e-93
Kurtosis: 5.753 Cond. No. 60.5
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Final distribution checks: compare the target variable across the splits
# visually (QQ plots) and formally (Mann-Whitney U).
qq_plot(y_train,y_test, 'Train data', 'Test data', 'ConvertedCompYearly', 'Yearly income')
test_same_distribution(y_train, y_test)
qq_plot(y_val,y_test, 'Val data', 'Test data', 'ConvertedCompYearly', 'Yearly income')
test_same_distribution(y_val, y_test)
stat=49822.500, p=0.980
Probably the same distribution
stat=5777.000, p=0.308
Probably the same distribution