import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import normalize, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV
from sklearn.metrics import mean_absolute_error
# Load the life-expectancy CSV; the path is relative to the working directory.
csv_path = 'LifeExpectancyData-20211218-120103.csv'
df = pd.read_csv(filepath_or_buffer=csv_path)
# Bare expression: in a notebook this renders the frame as the cell output.
df
Countryobject
Afghanistan0.5%
Albania0.5%
191 others98.9%
Yearint64
2000 - 2015
0
Afghanistan
2015
1
Afghanistan
2014
2
Afghanistan
2013
3
Afghanistan
2012
4
Afghanistan
2011
5
Afghanistan
2010
6
Afghanistan
2009
7
Afghanistan
2008
8
Afghanistan
2007
9
Afghanistan
2006
# Correlation heat-map of the numeric columns of `df`, followed by a seaborn
# pair plot with 'Life expectancy' on the x axis.
# Plotting recipe taken from:
# https://stackoverflow.com/questions/29432629/plot-correlation-matrix-using-pandas
#
# Correlate only the numeric columns: when the frame also holds non-numeric
# columns, DataFrame.corr() raises on pandas >= 2.0 instead of skipping them.
# Using the same numeric frame for the matrix and the tick labels also
# guarantees that the labels line up with the rows/columns being drawn.
numeric_df = df.select_dtypes(['number'])
tick_positions = range(numeric_df.shape[1])

f = plt.figure(figsize=(9, 9))
plt.matshow(numeric_df.corr(), fignum=f.number)
plt.xticks(tick_positions, numeric_df.columns, fontsize=13, ha="left", rotation=45)
plt.yticks(tick_positions, numeric_df.columns, fontsize=13)
cb = plt.colorbar()
cb.ax.tick_params(labelsize=11)
# Trailing semicolon suppresses the notebook's echo of the Text object.
plt.title('Correlation Matrix', fontsize=16);
sns.pairplot(df, x_vars=['Life expectancy'])
# Straight line used further down as a visual/threshold separator.
# Line equation obtained with https://www.webmath.com/equline1.html
# NOTE(review): the equation originally recorded here was y = -8/3 x + 730/3
# (intercept ~243.3), but the code uses an intercept of 250 -- confirm which
# one is intended.
xs = [*range(35, 100, 5)]  # x values 35, 40, ..., 95
ys = list(map(lambda x: -8 * x / 3 + 250, xs))
print(ys)
[156.66666666666669, 143.33333333333331, 130.0, 116.66666666666666, 103.33333333333334, 90.0, 76.66666666666666, 63.33333333333334, 50.0, 36.66666666666666, 23.333333333333343, 10.0, -3.333333333333343]
import plotly.graph_objects as go
# Interactive plot: 'Adult Mortality' against 'Life expectancy' as markers,
# with the straight line (xs, ys) drawn on top.
fig = go.Figure()
observed = go.Scatter(
    x=df['Life expectancy'],
    y=df['Adult Mortality'],
    mode='markers',
    name='markers',
)
separator_line = go.Scatter(x=xs, y=ys, mode='lines', name='lines')
# Add the traces in the same order as before: markers first, then the line.
for trace in (observed, separator_line):
    fig.add_trace(trace)
fig.show()
# Rows whose 'Adult Mortality' lies below the line -8/3 * life_expectancy + 250
# get that value multiplied by 10. The mask is recomputed and the correction
# applied a second time to deal with a second shift of the decimal point.
for _ in range(2):
    # Boolean mask (not index labels) of the rows still below the line.
    wrong_mortality_indexes = df['Adult Mortality'] < -8 * df['Life expectancy'] / 3 + 250
    corrected = df.loc[wrong_mortality_indexes, 'Adult Mortality'] * 10
    df.loc[wrong_mortality_indexes, 'Adult Mortality'] = corrected
# Scatter plot of BMI (y axis) against life expectancy (x axis).
fig, ax = plt.subplots()
ax.scatter(df['Life expectancy'], df['BMI'])
ax.set_xlabel('Life expectancy', fontsize=13)
ax.set_ylabel('BMI', fontsize=13)
plt.show()
-- NOTE(review): this is SQL, not Python -- presumably a notebook SQL cell that
-- queries the DataFrame `df` as a table; confirm how it is executed.
-- Selects every row whose BMI is below 10.
SELECT *
FROM df
WHERE BMI<10
Countryobject
Viet Nam1.8%
Portugal1.4%
160 others96.8%
Yearint64
2000 - 2015
0
Albania
2006
1
Algeria
2007
2
Angola
2010
3
Antigua and Barbuda
2004
4
Antigua and Barbuda
2003
5
Argentina
2011
6
Armenia
2009
7
Armenia
2008
8
Australia
2004
9
Australia
2003
# Build the cleaned frame, impute missing values, and split it into the
# regression target and the feature table.

# Discard rows with BMI below 10 or above 40. Rows with a missing BMI compare
# False in both tests, so they are kept and imputed below.
df_clean = df.drop(df[df['BMI'] < 10].index)
df_clean = df_clean.drop(df_clean[df_clean['BMI'] > 40].index)

# Derived feature: GDP divided by population.
df_clean['GDP per capita'] = df_clean['GDP'] / df_clean['Population']

# Notebook display: names of the columns that still contain missing values.
df_clean.columns[df_clean.isna().any()].tolist()

# Fill gaps with the per-column median. numeric_only=True is required because
# the frame also contains text columns, for which DataFrame.median() raises on
# pandas >= 2.0; those columns are left untouched by fillna.
df_clean = df_clean.fillna(df_clean.median(numeric_only=True))

# Target variable and feature table (the target and three other columns are
# excluded from the features).
target = df_clean['Life expectancy']
data = df_clean.drop(columns=['Life expectancy', 'Thinness 5-9 years', 'Percentage expenditure', 'Under-five deaths'])
# Feature columns that are rescaled before modelling.
numeric_cols = [
    "Year", "Adult Mortality", "Infant deaths", "Alcohol",
    "Hepatitis B", "Measles", "BMI", "Polio",
    "Total expenditure", "Diphtheria", "HIV/AIDS", "GDP",
    "Population", "Thinness 1-19 years",
    "Income composition of resources", "Schooling",
    "GDP per capita",
]

# Fit the scaler on the selected columns and transform them in one step.
scaler = RobustScaler()
scaled_values = scaler.fit_transform(data[numeric_cols])
# Re-wrap the scaled values in a DataFrame with the original column names;
# no index is passed, so the new frame gets a default 0..n-1 index.
normalized_numeric_cols = pd.DataFrame(scaled_values, columns=numeric_cols)
# Notebook display of the resulting (rows, columns) shape.
normalized_numeric_cols.shape
# Columns that are one-hot encoded.
categorical_cols = ['Status', 'Country']

# Indicator columns for the categoricals; the index is reset so the rows line
# up positionally with the other pieces when they are concatenated.
one_hot_cols = pd.get_dummies(data[categorical_cols])
one_hot_cols = one_hot_cols.reset_index(drop=True)
# Notebook display of the encoded shape.
one_hot_cols.shape

# Any feature column that was neither scaled nor encoded is passed through as-is.
handled_cols = {*numeric_cols, *categorical_cols}
unchanged_cols = [col for col in data.columns if col not in handled_cols]
other_cols = data[unchanged_cols].reset_index(drop=True)

# Final design matrix: scaled numerics + one-hot categoricals + untouched columns.
ready_data = pd.concat([normalized_numeric_cols, one_hot_cols, other_cols], axis=1)
# Hold out 10% of the rows for testing; the seed is fixed for reproducibility.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    ready_data, target, test_size=0.1, random_state=42
)

# Fit three linear models on the training split (fit() returns the estimator).
lin_reg = LinearRegression().fit(Xtrain, ytrain)
ridge_reg = Ridge(alpha=0.001).fit(Xtrain, ytrain)
lasso_reg = LassoCV(alphas=[0.001, 0.01, 0.1, 1, 10]).fit(Xtrain, ytrain)

# Predict on the held-out rows with each model.
lin_preds, ridge_preds, lasso_preds = (
    model.predict(Xtest) for model in (lin_reg, ridge_reg, lasso_reg)
)

# Mean absolute error of each model on the test split.
lin_mae, ridge_mae, lasso_mae = (
    mean_absolute_error(ytest, preds)
    for preds in (lin_preds, ridge_preds, lasso_preds)
)
print(lin_mae, ridge_mae, lasso_mae)
0.9189330205453181 0.9187763814155553 0.9599013264811995