# Import of libraries and data
# Import of general libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Import of specific libraries used in a few functions
import statsmodels.api as sm
from statsmodels.graphics.gofplots import qqplot_2samples  # Used in our two-sample QQ plot
from scipy.stats import ttest_ind  # Used for Student's t-test
from scipy.stats import mannwhitneyu  # Used to check whether the train and validation data come from the same distribution
from scipy.stats import pearsonr  # Used in the function "test_pearson_correlation"
from scipy.stats import norm  # Used in "hist_and_norm", where we overlay a normal distribution curve on a histogram of the data, and similarly in "get_gaussian_pdf"
from scipy.stats import shapiro  # Used in "test_for_normality" to check whether the data is normally distributed
from sklearn.model_selection import train_test_split  # Used in the function that splits the data
from sklearn.linear_model import LinearRegression  # Used as our model
from sklearn.metrics import mean_squared_error, r2_score  # Used when calculating our model's results
import math  # Used when taking the square root of our mean squared error
# Import of data
df = pd.read_csv("survey_results_public.csv")
df_nordic = df[df['Country'].isin(['Sweden','Norway','Denmark','Finland'])]
# The majority work full-time and are developers by profession. For reliability of the comparison, we focus only on people employed full-time as developers.
df_nordic_working_fulltime = df_nordic[df_nordic["Employment"] == "Employed full-time"]
df_nordic_working_developer = df_nordic_working_fulltime[df_nordic_working_fulltime['MainBranch']=='I am a developer by profession']
# We choose the variables of interest, explained in more detail further down.
# Selecting by name rather than by position makes the selection robust to schema changes.
comp_df = df_nordic_working_developer[['Country', 'EdLevel', 'YearsCodePro', 'ConvertedCompYearly']]
comp_df = comp_df.replace('Less than 1 year', 0)
# The raw survey may also contain 'More than 50 years'; errors='coerce' turns any such
# leftover strings into NaN, which the dropna below then removes.
comp_df['YearsCodePro'] = pd.to_numeric(comp_df['YearsCodePro'], errors='coerce')
# We remove the extreme values.
comp_df = comp_df.loc[comp_df['ConvertedCompYearly'] <= 150000]
final_df = comp_df.loc[comp_df['ConvertedCompYearly'] > 15000].reset_index(drop=True)
final_df = final_df[final_df['EdLevel'].isin(['Bachelor’s degree (B.A., B.S., B.Eng., etc.)', 'Master’s degree (M.A., M.S., M.Eng., MBA, etc.)'])]
final_df.dropna(inplace=True)
final_df
# Makes dummy variables so that we can work with the categorical variables
def make_dummy(df, x_columns, y_column):
    X = df[x_columns]  # Specifies the independent variables
    Y = df[y_column]  # Specifies the dependent variable
    # Makes the dummy variables, dropping the first level as the base category
    X = pd.get_dummies(data=X, drop_first=True)
    return X, Y
X, Y = make_dummy(final_df,['Country', 'YearsCodePro', 'EdLevel'], 'ConvertedCompYearly')
X_pol, Y = make_dummy(final_df,['Country', 'YearsCodePro', 'EdLevel'], 'ConvertedCompYearly')
# Adds polynomial terms for the continuous variable, plus an interaction with education level
X_pol['YearsCodePro2'] = X_pol['YearsCodePro'].pow(2)
X_pol['YearsCodePro3'] = X_pol['YearsCodePro'].pow(3)
X_pol['YearsCodePro_Master'] = X_pol['YearsCodePro'] * X_pol['EdLevel_Master’s degree (M.A., M.S., M.Eng., MBA, etc.)']
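# Illustration only (a sketch, not used below): sklearn's PolynomialFeatures can generate
# the same powers of YearsCodePro; the interaction term above still has to be built by hand.
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=3, include_bias=False)
years_poly = poly.fit_transform(X_pol[['YearsCodePro']])  # columns: x, x^2, x^3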
# Divides the data into train, validation, and test sets for X and Y
def test_train_val_split(X, Y):
    # Splits the data into a test set and a "rest" set; the "rest" set is then divided into
    # a training set and a validation set. Since 1/9 of the remaining 90% is 10% of the
    # full data, the final split is roughly 80/10/10.
    X_rest, X_test, y_rest, y_test = train_test_split(X, Y, test_size=0.1, random_state=101)
    X_train, X_val, y_train, y_val = train_test_split(X_rest, y_rest, test_size=1/9, random_state=101)
    print("X_test:", X_test.shape)
    print("y_test:", y_test.shape)
    print("X_train:", X_train.shape)
    print("y_train:", y_train.shape)
    print("X_val:", X_val.shape)
    print("y_val:", y_val.shape)
    return X_test, y_test, X_train, y_train, X_val, y_val
X_test,y_test,X_train,y_train,X_val,y_val = test_train_val_split(X, Y)
X_test_pol,y_test_pol,X_train_pol,y_train_pol,X_val_pol,y_val_pol = test_train_val_split(X_pol, Y)
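# Quick sanity check (optional): with test_size=0.1 and then 1/9 of the rest,
# the three pieces should partition the data roughly 80/10/10.
assert len(X_train) + len(X_val) + len(X_test) == len(X)
print("train/val/test fractions:", len(X_train) / len(X), len(X_val) / len(X), len(X_test) / len(X))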
def qq_plot(data1, data2, label1, label2, variable, variable_desc):
    # The two-sample QQ plot needs equally many quantile points, so sample both data sets
    # down to the size of the smaller one
    n = min(len(data1), len(data2))
    pp_1 = sm.ProbPlot(np.array(data1.sample(n, random_state=101)))
    pp_2 = sm.ProbPlot(np.array(data2.sample(n, random_state=101)))
    qqplot_2samples(pp_1, pp_2, xlabel=label1, ylabel=label2, line="45")
    plt.title(variable_desc)
qq_plot(y_train, y_val, 'Train data', 'Validation data', 'ConvertedCompYearly', 'Converted yearly compensation')
# Mann-Whitney U Test, to test if two independent samples have the same distribution
# H0: Same distribution
# H1: Not the same distribution
def test_same_distribution(data1, data2):
stat, p = mannwhitneyu(data1, data2)
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
print('Probably the same distribution')
else:
print('Probably different distributions')
test_same_distribution(y_train, y_val)
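# Cross-check (a sketch): the two-sample Kolmogorov-Smirnov test is another way to compare
# the train and validation distributions.
from scipy.stats import ks_2samp
stat_ks, p_ks = ks_2samp(y_train, y_val)
print('KS: stat=%.3f, p=%.3f' % (stat_ks, p_ks))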
def make_histogram(df, df_rowname,bin_amount, graph_title, x_label, y_label):
ax = df[df_rowname].plot.hist(bins = bin_amount, title=graph_title, figsize=(8,5))
ax.set_xlabel(x_label)
ax.set_ylabel(y_label)
plt.show()
make_histogram(final_df, 'ConvertedCompYearly', 40, 'Earnings for Nordic people working full-time with programming', 'Dollars earned yearly', 'Number of persons in each yearly income category')
make_histogram(final_df, 'YearsCodePro', 20, 'Total years a person has coded professionally', 'Years coded professionally', 'Number of persons in each category')
# Categorized by education level (EdLevel)
sns.pairplot(final_df, hue='EdLevel')
# Correlation matrix
sns.heatmap(final_df.corr(numeric_only=True), annot=True, lw=1)
# Pearson's correlation test, to see whether there is a linear correlation between the
# dependent and independent variables
def test_pearson_correlation(data1, data2):
stat, p = pearsonr(data1, data2)
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
print('Probably independent')
else:
print('Probably dependent')
print("YearsCodePro VS ConvertedCompYearly:")
data1 = final_df['ConvertedCompYearly']
data2 = final_df['YearsCodePro']
test_pearson_correlation(data1, data2)
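# Optional cross-check (a sketch): Spearman's rank correlation is a non-parametric
# alternative that does not assume normally distributed data.
from scipy.stats import spearmanr
stat_s, p_s = spearmanr(data1, data2)
print('Spearman: stat=%.3f, p=%.3f' % (stat_s, p_s))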
def describe_data(df):
df_description = df.describe()
df_description.loc["range"] = df_description.loc['max'] - df_description.loc['min']
print (df_description)
describe_data(final_df)
def make_bar(df, x_rowname, y_rowname, graph_title, x_label, y_label):
ax = df.plot.bar(x = x_rowname, y = y_rowname, title=graph_title, figsize=(7,5))
ax.set_xlabel(x_label)
ax.set_ylabel(y_label)
ax.get_legend().remove()
plt.rc('xtick', labelsize=8)
plt.xticks(rotation = 0)
plt.show()
def group_and_barchart_income(df, grouping_column, graph_title, x_label, y_label):
grouped_df = df.groupby(grouping_column) \
.agg(count=(grouping_column, 'size'), mean_yearly_income=('ConvertedCompYearly', 'mean')) \
.reset_index()
print (grouped_df)
make_bar(grouped_df, grouping_column, 'mean_yearly_income', graph_title,x_label,y_label)
def group_and_barchart_code(df, grouping_column, graph_title, x_label, y_label):
grouped_df = df.groupby(grouping_column) \
.agg(count=(grouping_column, 'size'), mean_coding=('YearsCodePro', 'mean')) \
.reset_index()
print (grouped_df)
make_bar(grouped_df, grouping_column, 'mean_coding', graph_title,x_label,y_label)
group_and_barchart_income(final_df, 'EdLevel', 'Yearly income compared to education level', 'Education group', 'Dollars earned yearly')
group_and_barchart_income(final_df, 'Country', 'Yearly income compared to country', 'Country', 'Dollars earned yearly')
group_and_barchart_code(final_df, 'Country', 'Coding experience compared to country', 'Country', 'Years coding professionally')
def hist_and_norm(data, title):
mu, std = norm.fit(data)
# Plot the histogram.
plt.hist(data, bins=25, density=True, alpha=0.6)
# Plot the PDF.
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = norm.pdf(x, mu, std)
plt.plot(x, p, 'k', linewidth=2)
plt.title(title)
plt.show()
hist_and_norm(final_df['ConvertedCompYearly'], 'The distribution of converted yearly income, with a fitted normal curve')
hist_and_norm(final_df['YearsCodePro'], 'The distribution of years coding professionally, with a fitted normal curve')
# QQ plot against the normal distribution, as a visual check of normality
def QQ_plot_normal(data, title):
    # Standardize the data so it can be compared against the standard normal on a 45-degree line
    normalized_data = (data - data.mean()) / data.std()
    sm.qqplot(normalized_data, line='45')
    plt.title(title)
    plt.show()
QQ_plot_normal(final_df['ConvertedCompYearly'], "ConvertedCompYearly" )
QQ_plot_normal(final_df['YearsCodePro'], 'YearsCodePro')
# Shapiro-Wilk test for normality
def test_for_normality(data):
stat, p = shapiro(data)
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
print('Probably Gaussian')
else:
print('Probably not Gaussian')
test_for_normality(final_df['ConvertedCompYearly'])
test_for_normality(final_df['YearsCodePro'])
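# Optional follow-up (a sketch): income data is often right-skewed, so we can rerun the
# same test on the log of the data to see whether a log-normal fit is more plausible.
test_for_normality(np.log(final_df['ConvertedCompYearly']))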
print ("Means:")
print (final_df.mean())
print ("Standard deviations:")
print (final_df.std())
def get_gaussian_pdf(data, num_points=1000):
    # Fit a Gaussian by estimating the mean and standard deviation, then evaluate its PDF
    # on an evenly spaced grid over the data range
    estimated_mean = data.mean()
    estimated_std = data.std()
    x = np.linspace(data.min(), data.max(), num=num_points)
    pdf = norm.pdf(x, estimated_mean, estimated_std)
    return pdf, x, estimated_mean, estimated_std
def plot_pdfs(data1, data2, x1_PDF, x1_hist, x2_PDF, x2_hist, title, dep_var_label):
    fig, ax = plt.subplots(1, 1, figsize=(8, 4))
    # Compare these two groups: estimated PDF plus histogram for each
    pdf_1, x_1, _, _ = get_gaussian_pdf(data1)
    pdf_2, x_2, _, _ = get_gaussian_pdf(data2)
    ax.plot(x_1, pdf_1, label=x1_PDF)
    ax.hist(data1, bins=100, density=True, label=x1_hist)
    ax.plot(x_2, pdf_2, label=x2_PDF)
    ax.hist(data2, bins=100, density=True, label=x2_hist)
    ax.set_title(title)
    ax.set_xlabel(dep_var_label)
    ax.set_ylabel("Probability density")
    ax.legend()
# Estimate Gaussian PDFs and plot the income comparison between Master's and Bachelor's level
data1 = np.array(final_df[final_df["EdLevel"] == "Master’s degree (M.A., M.S., M.Eng., MBA, etc.)"]["ConvertedCompYearly"])
data2 = np.array(final_df[final_df["EdLevel"] == "Bachelor’s degree (B.A., B.S., B.Eng., etc.)"]["ConvertedCompYearly"])
x1_PDF = "Master’s degree estimated PDF"
x1_hist = "Master’s degree histogram"
x2_PDF = "Bachelor’s degree estimated PDF"
x2_hist = "Bachelor’s degree histogram"
title = "Comparison of yearly income between Masters and Bachelor level"
dep_var_label = "Income"
plot_pdfs(data1, data2, x1_PDF, x1_hist, x2_PDF, x2_hist, title, dep_var_label)
# Student's t-test, to test whether the two groups have the same mean
data1 = final_df[final_df["EdLevel"] == "Master’s degree (M.A., M.S., M.Eng., MBA, etc.)"]["ConvertedCompYearly"]
data2 = final_df[final_df["EdLevel"] == "Bachelor’s degree (B.A., B.S., B.Eng., etc.)"]["ConvertedCompYearly"]
stat, p = ttest_ind(data1, data2)
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
    print('Probably the same mean')
else:
    print('Probably different means')
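# Optional variant (a sketch): Welch's t-test drops the equal-variance assumption;
# scipy supports it through equal_var=False.
stat_w, p_w = ttest_ind(data1, data2, equal_var=False)
print('Welch: stat=%.3f, p=%.3f' % (stat_w, p_w))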
def linear_regression(X_train, y_train, X_val, y_val, data_label, graph_title, x_axis_label, y_axis_label):
    # Definition and fit of the linear regression model
    model = LinearRegression()
    model.fit(X_train, y_train)
    # Print the intercept and parameter coefficients
    print('Intercept for train data:', model.intercept_)
    coeff_parameter = pd.DataFrame(model.coef_, X_train.columns, columns=['Coefficient'])
    print('Parameter coefficients for train data:', coeff_parameter)
    # Compute predicted values and plot them against the true values in the validation data:
    # first the raw scatter, then the same data aggregated into six bins
    y_pred = model.predict(X_val)
    sns.regplot(x=y_val, y=y_pred)
    ax = sns.regplot(x=y_val, y=y_pred, x_bins=6)
    ax.set_title(graph_title, fontsize=15)
    ax.set_xlabel(x_axis_label, fontsize=12)
    ax.set_ylabel(y_axis_label, fontsize=12)
    print("Results for the", data_label)
    # The mean squared error and its root (RMSE, which has the same units as the target)
    mse = mean_squared_error(y_val, y_pred)
    print("Mean squared error: %.2f" % mse)
    rmse = math.sqrt(mse)
    print("RMSE:", rmse)
    # R-squared and coefficients with p-values, via a statsmodels OLS fit on the train data
    X_train_sm = sm.add_constant(X_train)
    ls = sm.OLS(y_train, X_train_sm).fit()
    print(ls.summary())
    plt.show()
linear_regression(X_train, y_train, X_val, y_val, "validation data", 'Multivariate linear regression', 'Validation data yearly income', 'Predicted yearly income')
linear_regression(X_train_pol, y_train_pol, X_val_pol, y_val_pol, "validation data", 'Multivariate third-degree polynomial linear regression', 'Validation data yearly income', 'Predicted yearly income')
linear_regression(X_train, y_train, X_test, y_test, "test data", 'Multivariate linear regression', 'Test data yearly income', 'Predicted yearly income')
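# Optional baseline (a sketch, not part of the original analysis): a mean-predictor
# gives a floor for the RMSE figures above.
from sklearn.dummy import DummyRegressor
baseline = DummyRegressor(strategy='mean').fit(X_train, y_train)
print('Baseline RMSE (test):', math.sqrt(mean_squared_error(y_test, baseline.predict(X_test))))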
qq_plot(y_train,y_test, 'Train data', 'Test data', 'ConvertedCompYearly', 'Yearly income')
test_same_distribution(y_train, y_test)
qq_plot(y_val,y_test, 'Val data', 'Test data', 'ConvertedCompYearly', 'Yearly income')
test_same_distribution(y_val, y_test)