import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import normalize, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV
from sklearn.metrics import mean_absolute_error
# Load the life-expectancy dataset from the CSV export.
df = pd.read_csv('LifeExpectancyData-20211218-120103.csv')
df

# Correlation-matrix heat map, adapted from
# https://stackoverflow.com/questions/29432629/plot-correlation-matrix-using-pandas
#
# The correlation is computed on the numeric columns only. The frame also
# carries text columns (e.g. 'Status' and 'Country', one-hot encoded later),
# and DataFrame.corr() raises on those with pandas >= 2.0 unless they are
# excluded. Selecting once also guarantees the tick labels line up with the
# rows/columns of the plotted matrix.
numeric_df = df.select_dtypes(['number'])
tick_positions = range(numeric_df.shape[1])

f = plt.figure(figsize=(9, 9))
plt.matshow(numeric_df.corr(), fignum=f.number)
plt.xticks(tick_positions, numeric_df.columns, fontsize=13, ha="left", rotation=45)
plt.yticks(tick_positions, numeric_df.columns, fontsize=13)
cb = plt.colorbar()
cb.ax.tick_params(labelsize=11)
# Trailing semicolon kept from the notebook: it suppresses the echoed repr.
plt.title('Correlation Matrix', fontsize=16);

# Pairwise plots of every numeric column against 'Life expectancy'.
sns.pairplot(df, x_vars=['Life expectancy'])
# Reference line separating plausible from implausible 'Adult Mortality'
# readings, as a function of life expectancy: y = -8*x/3 + 250.
# Line-equation helper used to derive it: https://www.webmath.com/equline1.html
# NOTE(review): the original note recorded the line as y = -8/3 x + 730/3
# (intercept ~243.3), but the code has always used an intercept of 250.
# Confirm which one is intended; the value 250 is kept here unchanged.
def mortality_threshold(life_expectancy):
    """Return the reference line's value at *life_expectancy*.

    Works element-wise, so it accepts a scalar as well as a pandas Series.
    """
    return -8 * life_expectancy / 3 + 250


xs = list(range(35, 100, 5))
ys = [mortality_threshold(x) for x in xs]
print(ys)

import plotly.graph_objects as go

# Overlay the reference line on the raw (life expectancy, adult mortality)
# scatter so the cut-off can be checked visually.
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=df['Life expectancy'],
        y=df['Adult Mortality'],
        mode='markers',
        name='markers',
    )
)
fig.add_trace(go.Scatter(x=xs, y=ys, mode='lines', name='lines'))
fig.show()

# Rows that fall below the line are treated as having a misplaced decimal
# point and are multiplied by 10. The pass runs twice: the second pass deals
# with values whose decimal point was shifted by two places and which are
# therefore still below the line after the first correction.
for _ in range(2):
    wrong_mortality_indexes = mortality_threshold(df['Life expectancy']) > df['Adult Mortality']
    df.loc[wrong_mortality_indexes, 'Adult Mortality'] = (
        df.loc[wrong_mortality_indexes, 'Adult Mortality'] * 10
    )
# Scatter plot of BMI against life expectancy, used to eyeball BMI outliers.
fig, ax = plt.subplots()
ax.scatter(df['Life expectancy'], df['BMI'])
ax.set_xlabel('Life expectancy', fontsize=13)
ax.set_ylabel('BMI', fontsize=13)
plt.show()
# Inspect the rows with a BMI below 10. This is the pandas form of the query
# "SELECT * FROM df WHERE BMI < 10"; the raw SQL is not valid Python and
# prevented the whole file from being parsed.
df[df['BMI'] < 10]
# Drop rows whose BMI lies outside [10, 40]. Rows with a missing BMI compare
# False on both sides and are therefore kept; they are imputed further down.
# .copy() makes df_clean independent of df so the column assignment below
# does not write into (or warn about) a view of the original frame.
bmi_out_of_range = (df['BMI'] < 10) | (df['BMI'] > 40)
df_clean = df.loc[~bmi_out_of_range].copy()

# Derived feature. A zero population would yield +/-inf, which fillna() does
# not touch and which would break the scaling step later, so such values are
# turned into NaN and imputed together with the other gaps.
df_clean['GDP per capita'] = (df_clean['GDP'] / df_clean['Population']).replace(
    [np.inf, -np.inf], np.nan
)

# Columns that still contain missing values (notebook-style inspection).
df_clean.columns[df_clean.isna().any()].tolist()

# Impute missing values with the per-column median. numeric_only=True is
# required with pandas >= 2.0, where median() raises on text columns instead
# of silently skipping them; non-numeric columns are left as they are.
df_clean = df_clean.fillna(df_clean.median(numeric_only=True))

# Regression target and feature frame. Three further columns are dropped
# from the features alongside the target itself.
target = df_clean['Life expectancy']
data = df_clean.drop(
    columns=[
        'Life expectancy',
        'Thinness 5-9 years',
        'Percentage expenditure',
        'Under-five deaths',
    ]
)
# Numeric features: scaled with RobustScaler.
numeric_cols = [
    'Year',
    'Adult Mortality',
    'Infant deaths',
    'Alcohol',
    'Hepatitis B',
    'Measles',
    'BMI',
    'Polio',
    'Total expenditure',
    'Diphtheria',
    'HIV/AIDS',
    'GDP',
    'Population',
    'Thinness 1-19 years',
    'Income composition of resources',
    'Schooling',
    'GDP per capita',
]
# fit_transform returns a plain array, so the resulting frame gets a fresh
# 0..n-1 index; the other pieces below are re-indexed the same way so that
# the final column-wise concat lines the rows up by position.
_scaled_values = RobustScaler().fit_transform(data[numeric_cols])
normalized_numeric_cols = pd.DataFrame(_scaled_values, columns=numeric_cols)
normalized_numeric_cols.shape

# Categorical features: one-hot encoded.
categorical_cols = ['Status', 'Country']
_dummies = pd.get_dummies(data[categorical_cols])
one_hot_cols = _dummies.reset_index(drop=True)
one_hot_cols.shape

# Any remaining column is passed through untouched.
_handled_cols = set(numeric_cols) | set(categorical_cols)
unchanged_cols = [col for col in data.columns if col not in _handled_cols]
other_cols = data[unchanged_cols].reset_index(drop=True)

# Final design matrix: scaled numerics, one-hot categoricals, pass-through.
ready_data = pd.concat(
    [normalized_numeric_cols, one_hot_cols, other_cols],
    axis=1,
)
# Hold out 10% of the rows for evaluation; the split is seeded for
# reproducibility.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    ready_data,
    target,
    test_size=0.1,
    random_state=42,
)

# Fit the three linear models on the training split (fit() returns the
# estimator itself, so construction and fitting can be chained).
lin_reg = LinearRegression().fit(Xtrain, ytrain)
ridge_reg = Ridge(alpha=0.001).fit(Xtrain, ytrain)
lasso_reg = LassoCV(alphas=[0.001, 0.01, 0.1, 1, 10]).fit(Xtrain, ytrain)

# Predict on the held-out split, then score each model by mean absolute error.
lin_preds, ridge_preds, lasso_preds = (
    model.predict(Xtest) for model in (lin_reg, ridge_reg, lasso_reg)
)
lin_mae, ridge_mae, lasso_mae = (
    mean_absolute_error(ytest, preds)
    for preds in (lin_preds, ridge_preds, lasso_preds)
)
print(lin_mae, ridge_mae, lasso_mae)