# Import of libraries and data
# Import of general libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Import of specific libraries used in a few functions
import statsmodels.api as sm
from statsmodels.graphics.gofplots import qqplot_2samples  # Used in our two-sample QQ plot
from scipy.stats import ttest_ind  # Used for Student's t-test
from scipy.stats import mannwhitneyu  # Used to check whether the train and validation data come from the same distribution
from scipy.stats import pearsonr  # Used in the function "test_pearson_correlation"
from scipy.stats import norm  # Used in "hist_and_norm", where we overlay a normal distribution curve on a histogram of the data, and similarly in "get_gaussian_pdf"
from scipy.stats import shapiro  # Used in "test_for_normality" to check whether the data is normally distributed
from sklearn.model_selection import train_test_split  # Used in the function that splits the data
from sklearn.linear_model import LinearRegression  # Used as our model
from sklearn.metrics import mean_squared_error, r2_score  # Used when calculating our model's results
import math  # Used when taking the square root of our mean squared error
# Import of data
df = pd.read_csv("survey_results_public.csv")
df_nordic = df[df['Country'].isin(['Sweden','Norway','Denmark','Finland'])]
# The majority work full-time and are developers by profession. For reliability of the comparison, we focus only on people employed full-time as developers.
df_nordic_working_fulltime = df_nordic[df_nordic["Employment"] == "Employed full-time"]
df_nordic_working_developer = df_nordic_working_fulltime[df_nordic_working_fulltime['MainBranch']=='I am a developer by profession']
# We choose the variables of interest, explained in more detail further down.
# Selecting by name rather than by position makes the selection robust to schema changes.
comp_df = df_nordic_working_developer[['Country', 'EdLevel', 'YearsCodePro', 'ConvertedCompYearly']]
comp_df = comp_df.replace('Less than 1 year', 0)
# The raw survey may also contain 'More than 50 years'; errors='coerce' turns any such
# leftover strings into NaN, which the dropna below then removes.
comp_df['YearsCodePro'] = pd.to_numeric(comp_df['YearsCodePro'], errors='coerce')
# We remove the extreme values.
comp_df = comp_df.loc[comp_df['ConvertedCompYearly'] <= 150000]
final_df = comp_df.loc[comp_df['ConvertedCompYearly'] > 15000].reset_index(drop=True)
final_df = final_df[final_df['EdLevel'].isin(['Bachelor’s degree (B.A., B.S., B.Eng., etc.)', 'Master’s degree (M.A., M.S., M.Eng., MBA, etc.)'])]
final_df.dropna(inplace=True)
final_df
# Makes dummy variables so that we can work with the categorical variables
def make_dummy(df, x_columns, y_column):
    X = df[x_columns]  # Specifies the independent variables
    Y = df[y_column]  # Specifies the dependent variable
    # Makes the dummy variables, dropping the first level as the base category
    X = pd.get_dummies(data=X, drop_first=True)
    return X, Y
X, Y = make_dummy(final_df,['Country', 'YearsCodePro', 'EdLevel'], 'ConvertedCompYearly')
X_pol, Y = make_dummy(final_df,['Country', 'YearsCodePro', 'EdLevel'], 'ConvertedCompYearly')
# Adds polynomial terms for the continuous variable, plus an interaction with education level
X_pol['YearsCodePro2'] = X_pol['YearsCodePro'].pow(2)
X_pol['YearsCodePro3'] = X_pol['YearsCodePro'].pow(3)
X_pol['YearsCodePro_Master'] = X_pol['YearsCodePro'] * X_pol['EdLevel_Master’s degree (M.A., M.S., M.Eng., MBA, etc.)']
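# Illustration only (a sketch, not used below): sklearn's PolynomialFeatures can generate
# the same powers of YearsCodePro; the interaction term above still has to be built by hand.
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=3, include_bias=False)
years_poly = poly.fit_transform(X_pol[['YearsCodePro']])  # columns: x, x^2, x^3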
# Divides the data into train, validation, and test sets for X and Y
def test_train_val_split(X, Y):
    # Splits the data into a test set and a "rest" set; the "rest" set is then divided into
    # a training set and a validation set. Since 1/9 of the remaining 90% is 10% of the
    # full data, the final split is roughly 80/10/10.
    X_rest, X_test, y_rest, y_test = train_test_split(X, Y, test_size=0.1, random_state=101)
    X_train, X_val, y_train, y_val = train_test_split(X_rest, y_rest, test_size=1/9, random_state=101)
    print("X_test:", X_test.shape)
    print("y_test:", y_test.shape)
    print("X_train:", X_train.shape)
    print("y_train:", y_train.shape)
    print("X_val:", X_val.shape)
    print("y_val:", y_val.shape)
    return X_test, y_test, X_train, y_train, X_val, y_val
X_test,y_test,X_train,y_train,X_val,y_val = test_train_val_split(X, Y)
X_test_pol,y_test_pol,X_train_pol,y_train_pol,X_val_pol,y_val_pol = test_train_val_split(X_pol, Y)
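# Quick sanity check (optional): with test_size=0.1 and then 1/9 of the rest,
# the three pieces should partition the data roughly 80/10/10.
assert len(X_train) + len(X_val) + len(X_test) == len(X)
print("train/val/test fractions:", len(X_train) / len(X), len(X_val) / len(X), len(X_test) / len(X))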
def qq_plot(data1, data2, label1, label2, variable, variable_desc):
    # The two-sample QQ plot needs equally many quantile points, so sample both data sets
    # down to the size of the smaller one
    n = min(len(data1), len(data2))
    pp_1 = sm.ProbPlot(np.array(data1.sample(n, random_state=101)))
    pp_2 = sm.ProbPlot(np.array(data2.sample(n, random_state=101)))
    qqplot_2samples(pp_1, pp_2, xlabel=label1, ylabel=label2, line="45")
    plt.title(variable_desc)
qq_plot(y_train, y_val, 'Train data', 'Validation data', 'ConvertedCompYearly', 'Converted yearly compensation')
# Mann-Whitney U Test, to test if two independent samples have the same distribution
# H0: Same distribution
# H1: Not the same distribution
def test_same_distribution(data1, data2):
stat, p = mannwhitneyu(data1, data2)
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
print('Probably the same distribution')
else:
print('Probably different distributions')
test_same_distribution(y_train, y_val)
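# Cross-check (a sketch): the two-sample Kolmogorov-Smirnov test is another way to compare
# the train and validation distributions.
from scipy.stats import ks_2samp
stat_ks, p_ks = ks_2samp(y_train, y_val)
print('KS: stat=%.3f, p=%.3f' % (stat_ks, p_ks))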
def make_histogram(df, df_rowname,bin_amount, graph_title, x_label, y_label):
ax = df[df_rowname].plot.hist(bins = bin_amount, title=graph_title, figsize=(8,5))
ax.set_xlabel(x_label)
ax.set_ylabel(y_label)
plt.show()
make_histogram(final_df, 'ConvertedCompYearly', 40, 'Earnings for Nordic people working full-time with programming', 'Dollars earned yearly', 'Number of persons in each yearly income category')
make_histogram(final_df, 'YearsCodePro', 20, 'Total years a person has coded professionally', 'Years coded professionally', 'Number of persons in each category')
# Categorized by education level (EdLevel)
sns.pairplot(final_df, hue='EdLevel')
# Correlation matrix
sns.heatmap(final_df.corr(numeric_only=True), annot=True, lw=1)
# Pearson's correlation test, to see whether there is a linear correlation between the
# dependent and independent variables
def test_pearson_correlation(data1, data2):
stat, p = pearsonr(data1, data2)
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
print('Probably independent')
else:
print('Probably dependent')
print("YearsCodePro VS ConvertedCompYearly:")
data1 = final_df['ConvertedCompYearly']
data2 = final_df['YearsCodePro']
test_pearson_correlation(data1, data2)
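# Optional cross-check (a sketch): Spearman's rank correlation is a non-parametric
# alternative that does not assume normally distributed data.
from scipy.stats import spearmanr
stat_s, p_s = spearmanr(data1, data2)
print('Spearman: stat=%.3f, p=%.3f' % (stat_s, p_s))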
def describe_data(df):
df_description = df.describe()
df_description.loc["range"] = df_description.loc['max'] - df_description.loc['min']
print (df_description)
describe_data(final_df)
def make_bar(df, x_rowname, y_rowname, graph_title, x_label, y_label):
ax = df.plot.bar(x = x_rowname, y = y_rowname, title=graph_title, figsize=(7,5))
ax.set_xlabel(x_label)
ax.set_ylabel(y_label)
ax.get_legend().remove()
plt.rc('xtick', labelsize=8)
plt.xticks(rotation = 0)
plt.show()
def group_and_barchart_income(df, grouping_column, graph_title, x_label, y_label):
grouped_df = df.groupby(grouping_column) \
.agg(count=(grouping_column, 'size'), mean_yearly_income=('ConvertedCompYearly', 'mean')) \
.reset_index()
print (grouped_df)
make_bar(grouped_df, grouping_column, 'mean_yearly_income', graph_title,x_label,y_label)
def group_and_barchart_code(df, grouping_column, graph_title, x_label, y_label):
grouped_df = df.groupby(grouping_column) \
.agg(count=(grouping_column, 'size'), mean_coding=('YearsCodePro', 'mean')) \
.reset_index()
print (grouped_df)
make_bar(grouped_df, grouping_column, 'mean_coding', graph_title,x_label,y_label)
group_and_barchart_income(final_df, 'EdLevel', 'Yearly income compared to education level', 'Education group', 'Dollars earned yearly')
group_and_barchart_income(final_df, 'Country', 'Yearly income compared to country', 'Country', 'Dollars earned yearly')
group_and_barchart_code(final_df, 'Country', 'Coding experience compared to country', 'Country', 'Years coding professionally')
def hist_and_norm(data, title):
mu, std = norm.fit(data)
# Plot the histogram.
plt.hist(data, bins=25, density=True, alpha=0.6)
# Plot the PDF.
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = norm.pdf(x, mu, std)
plt.plot(x, p, 'k', linewidth=2)
plt.title(title)
plt.show()
hist_and_norm(final_df['ConvertedCompYearly'], 'The distribution of converted yearly income, with a fitted normal curve')
hist_and_norm(final_df['YearsCodePro'], 'The distribution of years coding professionally, with a fitted normal curve')
# QQ plot against the normal distribution, as a visual check of normality
def QQ_plot_normal(data, title):
    # Standardize the data so it can be compared against the standard normal on a 45-degree line
    normalized_data = (data - data.mean()) / data.std()
    sm.qqplot(normalized_data, line='45')
    plt.title(title)
    plt.show()
QQ_plot_normal(final_df['ConvertedCompYearly'], "ConvertedCompYearly" )
QQ_plot_normal(final_df['YearsCodePro'], 'YearsCodePro')
# Shapiro-Wilk test for normality
def test_for_normality(data):
stat, p = shapiro(data)
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
print('Probably Gaussian')
else:
print('Probably not Gaussian')
test_for_normality(final_df['ConvertedCompYearly'])
test_for_normality(final_df['YearsCodePro'])
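# Optional follow-up (a sketch): income data is often right-skewed, so we can rerun the
# same test on the log of the data to see whether a log-normal fit is more plausible.
test_for_normality(np.log(final_df['ConvertedCompYearly']))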
print ("Means:")
print (final_df.mean())
print ("Standard deviations:")
print (final_df.std())
def get_gaussian_pdf(data, num_points=1000):
    # Fit a Gaussian by estimating the mean and standard deviation, then evaluate its PDF
    # on an evenly spaced grid over the data range
    estimated_mean = data.mean()
    estimated_std = data.std()
    x = np.linspace(data.min(), data.max(), num=num_points)
    pdf = norm.pdf(x, estimated_mean, estimated_std)
    return pdf, x, estimated_mean, estimated_std
def plot_pdfs(data1, data2, x1_PDF, x1_hist, x2_PDF, x2_hist, title, dep_var_label):
    fig, ax = plt.subplots(1, 1, figsize=(8, 4))
    # Compare these two groups: estimated PDF plus histogram for each
    pdf_1, x_1, _, _ = get_gaussian_pdf(data1)
    pdf_2, x_2, _, _ = get_gaussian_pdf(data2)
    ax.plot(x_1, pdf_1, label=x1_PDF)
    ax.hist(data1, bins=100, density=True, label=x1_hist)
    ax.plot(x_2, pdf_2, label=x2_PDF)
    ax.hist(data2, bins=100, density=True, label=x2_hist)
    ax.set_title(title)
    ax.set_xlabel(dep_var_label)
    ax.set_ylabel("Probability density")
    ax.legend()
# Estimate Gaussian PDFs and plot the income comparison between Master's and Bachelor's level
data1 = np.array(final_df[final_df["EdLevel"] == "Master’s degree (M.A., M.S., M.Eng., MBA, etc.)"]["ConvertedCompYearly"])
data2 = np.array(final_df[final_df["EdLevel"] == "Bachelor’s degree (B.A., B.S., B.Eng., etc.)"]["ConvertedCompYearly"])
x1_PDF = "Master’s degree estimated PDF"
x1_hist = "Master’s degree histogram"
x2_PDF = "Bachelor’s degree estimated PDF"
x2_hist = "Bachelor’s degree histogram"
title = "Comparison of yearly income between Masters and Bachelor level"
dep_var_label = "Income"
plot_pdfs(data1, data2, x1_PDF, x1_hist, x2_PDF, x2_hist, title, dep_var_label)
# Student's t-test, to test whether the two groups have the same mean
data1 = final_df[final_df["EdLevel"] == "Master’s degree (M.A., M.S., M.Eng., MBA, etc.)"]["ConvertedCompYearly"]
data2 = final_df[final_df["EdLevel"] == "Bachelor’s degree (B.A., B.S., B.Eng., etc.)"]["ConvertedCompYearly"]
stat, p = ttest_ind(data1, data2)
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
    print('Probably the same mean')
else:
    print('Probably different means')
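# Optional variant (a sketch): Welch's t-test drops the equal-variance assumption;
# scipy supports it through equal_var=False.
stat_w, p_w = ttest_ind(data1, data2, equal_var=False)
print('Welch: stat=%.3f, p=%.3f' % (stat_w, p_w))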
def linear_regression(X_train, y_train, X_val, y_val, data_label, graph_title, x_axis_label, y_axis_label):
    # Definition and fit of the linear regression model
    model = LinearRegression()
    model.fit(X_train, y_train)
    # Print the intercept and parameter coefficients
    print('Intercept for train data:', model.intercept_)
    coeff_parameter = pd.DataFrame(model.coef_, X_train.columns, columns=['Coefficient'])
    print('Parameter coefficients for train data:', coeff_parameter)
    # Compute predicted values and plot them against the true values in the validation data:
    # first the raw scatter, then the same data aggregated into six bins
    y_pred = model.predict(X_val)
    sns.regplot(x=y_val, y=y_pred)
    ax = sns.regplot(x=y_val, y=y_pred, x_bins=6)
    ax.set_title(graph_title, fontsize=15)
    ax.set_xlabel(x_axis_label, fontsize=12)
    ax.set_ylabel(y_axis_label, fontsize=12)
    print("Results for the", data_label)
    # The mean squared error and its root (RMSE, which has the same units as the target)
    mse = mean_squared_error(y_val, y_pred)
    print("Mean squared error: %.2f" % mse)
    rmse = math.sqrt(mse)
    print("RMSE:", rmse)
    # R-squared and coefficients with p-values, via a statsmodels OLS fit on the train data
    X_train_sm = sm.add_constant(X_train)
    ls = sm.OLS(y_train, X_train_sm).fit()
    print(ls.summary())
    plt.show()
linear_regression(X_train, y_train, X_val, y_val, "validation data", 'Multivariate linear regression', 'Validation data yearly income', 'Predicted yearly income')
linear_regression(X_train_pol, y_train_pol, X_val_pol, y_val_pol, "validation data", 'Multivariate third-degree polynomial linear regression', 'Validation data yearly income', 'Predicted yearly income')
linear_regression(X_train, y_train, X_test, y_test, "test data", 'Multivariate linear regression', 'Test data yearly income', 'Predicted yearly income')
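# Optional baseline (a sketch, not part of the original analysis): a mean-predictor
# gives a floor for the RMSE figures above.
from sklearn.dummy import DummyRegressor
baseline = DummyRegressor(strategy='mean').fit(X_train, y_train)
print('Baseline RMSE (test):', math.sqrt(mean_squared_error(y_test, baseline.predict(X_test))))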
qq_plot(y_train,y_test, 'Train data', 'Test data', 'ConvertedCompYearly', 'Yearly income')
test_same_distribution(y_train, y_test)
qq_plot(y_val,y_test, 'Val data', 'Test data', 'ConvertedCompYearly', 'Yearly income')
test_same_distribution(y_val, y_test)