import janitor
import matplotlib.pyplot as plt
import missingno
import nhanes.load
import numpy as np
import pandas as pd
import scipy.stats
import seaborn as sns
import session_info
import sklearn.compose
import sklearn.impute
import sklearn.preprocessing
import statsmodels.api as sm
import statsmodels.datasets
import statsmodels.formula.api as smf
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import BayesianRidge, Ridge
from sklearn.neighbors import KNeighborsRegressor
from statsmodels.graphics.mosaicplot import mosaic
# Execute the project's pandas missing-data extension script in this session.
# Later steps call a `.missing` accessor on DataFrames, presumably registered
# by this script — confirm against the extension file.
%run ../handling_missing_data/utils/pandas-missing-extension.py
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Global plotting defaults: 8x6 figures drawn with seaborn's "whitegrid" style.
sns.set(rc={"figure.figsize": (8, 6)})
sns.set_style("whitegrid")
# Load R's `airquality` dataset and index it by calendar date.
# Column names are normalised to snake_case, a constant `year` column (1973)
# is added, and `year`/`month`/`day` are combined into a `date` column that
# becomes the (sorted) index.
airquality_df = (
    sm.datasets.get_rdataset("airquality")
    .data
    .clean_names(case_type="snake")
    .add_column("year", 1973)
    .assign(date=lambda frame: pd.to_datetime(frame[["year", "month", "day"]]))
    .sort_values(by="date")
    .set_index("date")
)
airquality_df
# temp ~ ozone
# Fit an OLS regression of temperature on ozone and show only the first
# table of the fit summary.
smf.ols(formula="temp ~ ozone", data=airquality_df).fit().summary().tables[0]
# temp ~ ozone + solar_r
# Same regression with solar radiation added as a second predictor; again
# only the first table of the fit summary is shown.
smf.ols(formula="temp ~ ozone + solar_r", data=airquality_df).fit().summary().tables[0]
# `flchain` dataset from R's `survival` package, fetched through statsmodels.
survival_df = sm.datasets.get_rdataset(dataname="flchain", package="survival").data
# Compare how often `weight` is missing for each gender.
# `weight` is replaced by a boolean "is missing" indicator, the indicator is
# split by gender, and the two groups are compared with a two-sided t-test.
# NOTE(review): `nhanes_df` is not defined in the visible part of this
# script — confirm it is loaded before this point.
female_weight, male_weight = (
    nhanes_df
    .select_columns("gender", "weight")
    .transform_column("weight", lambda col: col.isna(), elementwise=False)
    .groupby("gender")["weight"]
    .pipe(lambda grouped: (grouped.get_group("Female"), grouped.get_group("Male")))
)
scipy.stats.ttest_ind(female_weight, male_weight, alternative="two-sided")
# Mosaic plot of weight missingness by gender and general health condition.
# `weight` is recoded to "NA" / "!NA", group sizes are counted (keeping
# missing group keys), and tiles whose key contains "NA" are drawn red while
# all other tiles are gray. Tile labels and the title are left blank.
fig, ax = plt.subplots(figsize=(10, 10))
(
    nhanes_df
    .select_columns("weight", "general_health_condition", "gender")
    .assign(
        weight=lambda frame: frame.weight.isna().replace([True, False], ["NA", "!NA"])
    )
    .groupby(
        ["gender", "general_health_condition", "weight"],
        dropna=False,
        as_index=True,
    )
    .size()
    .pipe(
        lambda counts: mosaic(
            data=counts,
            properties=lambda tile: {"color": "r" if "NA" in tile else "gray"},
            ax=ax,
            horizontal=True,
            axes_label=True,
            title="",
            labelizer=lambda tile: "",
        )
    )
)
# The whitegrid style would otherwise draw grid lines over the tiles.
ax.grid(False)
# Working copy of the NHANES frame; it is replaced by the encoded frame
# built further down.
nhanes_transformed_df = nhanes_df.copy(deep=True)

# Minimal OrdinalEncoder demonstration on a toy single-column dataset.
encoder = sklearn.preprocessing.OrdinalEncoder()
X = [["Male"], ["Female"], ["Female"]]
X
# The encoder has to learn its categories before it can decode anything:
# calling `inverse_transform` on an unfitted encoder raises NotFittedError.
encoder.fit_transform(X)
# Map integer codes back to the original labels (categories are sorted when
# learned, so 0 -> "Female" and 1 -> "Male").
encoder.inverse_transform([[1], [0], [0], [1]])
# Ordinal-encode every object/category column of `nhanes_df`; all remaining
# columns are passed through unchanged.
categorical_columns = nhanes_df.select_dtypes(include=[object, "category"]).columns
categorical_transformer = sklearn.compose.make_column_transformer(
    (sklearn.preprocessing.OrdinalEncoder(), categorical_columns),
    remainder="passthrough",
)
# Several categories
# Rebuild a DataFrame from the transformer output, keeping the original index
# and stripping the "<step>__" prefixes the transformer adds to column names.
nhanes_transformed_df = (
    pd.DataFrame(
        data=categorical_transformer.fit_transform(nhanes_df),
        columns=categorical_transformer.get_feature_names_out(),
        index=nhanes_df.index,
    )
    .rename_columns(
        function=lambda name: (
            name.removeprefix("ordinalencoder__").removeprefix("remainder__")
        )
    )
)
nhanes_transformed_df
# Second working copy, used for the one-hot encoding example.
nhanes_transformed_df2 = nhanes_df.copy(deep=True)
# Preview of one-hot (dummy) encoding for `general_health_condition` using
# pandas alone; the result is only displayed, not stored.
pd.get_dummies(nhanes_transformed_df2.select_columns("general_health_condition"))
# `transformer` was used below (and in the category print-out that follows)
# without being defined anywhere in the visible script, which fails with
# NameError. It is defined here with the two encoders those later look-ups
# expect ("ordinalencoder" and "onehotencoder").
# NOTE(review): the column assignment is an assumption — one-hot on
# `general_health_condition` mirrors the get_dummies preview above, ordinal
# on `gender` is the other categorical column seen in this script. Confirm.
transformer = sklearn.compose.make_column_transformer(
    (sklearn.preprocessing.OrdinalEncoder(), ["gender"]),
    (sklearn.preprocessing.OneHotEncoder(), ["general_health_condition"]),
    remainder="passthrough",
    # OneHotEncoder emits sparse output by default; force a dense array so
    # the result can be wrapped in a DataFrame directly.
    sparse_threshold=0,
)
# Rebuild a DataFrame from the transformer output, keeping the original index
# and stripping the "<step>__" prefixes the transformer adds to column names.
nhanes_transformed_df2 = (
    pd.DataFrame(
        transformer.fit_transform(nhanes_df),
        columns=transformer.get_feature_names_out(),
        index=nhanes_df.index,
    )
    .rename_columns(
        function=lambda name: (
            name
            .removeprefix("ordinalencoder__")
            .removeprefix("remainder__")
            .removeprefix("onehotencoder__")
        )
    )
)
nhanes_transformed_df2
# Print the categories learned by the fitted ordinal and one-hot encoders,
# one encoder per line.
print(
    transformer.named_transformers_.get("ordinalencoder").categories_,
    transformer.named_transformers_.get("onehotencoder").categories_,
    sep="\n",
)
# Mean of `height` after filling its missing values with the column mean.
(
    nhanes_df
    .transform_column("height", lambda col: col.fillna(col.mean()), elementwise=False)
    .height
    .mean()
)
# Median-impute `height` and `weight`, then plot them against each other.
# The shadow matrix (columns suffixed `_imp`) is bound before the values are
# filled — presumably so the plot can tell imputed points from observed ones;
# confirm against the `.missing` extension.
(
    nhanes_df
    .select_columns("height", "weight")
    .missing.bind_shadow_matrix(True, False, suffix="_imp")
    .assign(
        height=lambda frame: frame.height.fillna(value=frame.height.median()),
        weight=lambda frame: frame.weight.fillna(value=frame.weight.median()),
    )
    .missing.scatter_imputation_plot(
        x="height", y="weight", show_marginal=True, height=10
    )
);
# Backward fill: each missing value takes the next valid value below it
# (same result as the older `.fillna(method='bfill')` spelling).
nhanes_df.select_columns("height", "weight").bfill()
# Forward fill: each missing value takes the last valid value above it
# (same result as the older `.fillna(method='ffill')` spelling).
nhanes_df.select_columns("height", "weight").ffill()
# Raw ozone series over time; missing readings show up as gaps in the line.
plt.figure(figsize=(15, 10))
airquality_df.select_columns("ozone").ozone.plot(color="#313638", marker="o")
# Overlay forward-filled and backward-filled ozone (dashed, semi-transparent)
# with the observed series drawn last so it sits on top.
plt.figure(figsize=(20, 10))
airquality_df.ozone.ffill().plot(
    color="#ff5447", marker="o", alpha=6 / 9, linestyle="dashed"
)
airquality_df.ozone.bfill().plot(
    color="#e647ff", marker="o", alpha=6 / 9, linestyle="dashed"
)
airquality_df.ozone.plot(color="#313638", marker="o")
# Labels follow the drawing order of the three lines above.
plt.legend(["forward fill", "backward fill", "ozone"])
# Overlay linearly and Akima-interpolated ozone (dashed) with the observed
# series drawn last so it sits on top.
plt.figure(figsize=(20, 10))
airquality_df.ozone.interpolate(method="linear").plot(
    color="r", marker="o", linestyle="dashed"
)
airquality_df.ozone.interpolate(method="akima").plot(
    color="#e647ff", marker="o", linestyle="dashed"
)
airquality_df.ozone.plot(color="#313638", marker="o");
# Labels follow the drawing order of the three lines above. The observed
# series previously had no label, so it was missing from the legend (unlike
# the forward/backward-fill figure); it is labelled here.
plt.legend([
    "linear",
    "akima",
    "ozone",
]);
nhanes_transformed_df
# K-nearest-neighbours imputation on the encoded frame.
# Columns are first reordered with the extension's
# `sort_variables_by_missingness`; the imputed values are rounded and written
# back in place so the frame keeps its index and column labels.
knn_inputer = sklearn.impute.KNNImputer()
nhanes_df_knn = (
    nhanes_transformed_df
    .missing.sort_variables_by_missingness(ascending=True)
    .copy(deep=True)
)
# `nhanes_df_knn` is a deep copy of the sorted frame, so fitting on it is the
# same as fitting on a freshly sorted frame.
nhanes_df_knn.iloc[:, :] = knn_inputer.fit_transform(nhanes_df_knn).round()
# Modelling frame for regression-based imputation of `height`.
# Rows are sorted by height and gaps in `weight` are forward-filled in that
# order, so each missing weight takes the value from the preceding row.
# Shadow columns (suffix `_imp`) are then bound for every column
# (`only_missing=False`); their exact contents are defined by the `.missing`
# extension — confirm there.
nhanes_model_df = (
    nhanes_df
    .select_columns("height", "weight", "gender", "age")
    .sort_values(by="height", ascending=True)
    .transform_column("weight", lambda col: col.ffill(), elementwise=False)
    .missing.bind_shadow_matrix(True, False, suffix="_imp", only_missing=False)
)
nhanes_model_df
# OLS model of height on weight, gender and age, fitted on the modelling frame.
height_ols = smf.ols(
    formula="height ~ weight + gender + age",
    data=nhanes_model_df,
).fit()
# Iterative (MICE-style) imputation on the encoded frame, using a Bayesian
# ridge regressor per feature, mean initialisation, and ascending imputation
# order.
mice_imputer = sklearn.impute.IterativeImputer(
    estimator=BayesianRidge(),
    initial_strategy="mean",
    imputation_order="ascending",
)
# Write the rounded imputed values into a copy so the index and column
# labels of the encoded frame are preserved.
nhanes_mice_df = nhanes_transformed_df.copy(deep=True)
nhanes_mice_df.iloc[:, :] = mice_imputer.fit_transform(nhanes_transformed_df).round()
nhanes_mice_df
# Compare the distribution of `general_health_condition` before imputation
# (original labels) and after it (encoded, imputed values).
nhanes_imputed_df = nhanes_mice_df.copy(deep=True)
nhanes_df["general_health_condition"].value_counts()
nhanes_imputed_df["general_health_condition"].value_counts()
# Report the package versions used in this session.
session_info.show()