8.1 Imputación de datos faltantes

import janitor import matplotlib.pyplot as plt import missingno import nhanes.load import numpy as np import pandas as pd import scipy.stats import seaborn as sns import session_info import sklearn.compose import sklearn.impute import sklearn.preprocessing import statsmodels.api as sm import statsmodels.datasets import statsmodels.formula.api as smf from sklearn.ensemble import RandomForestRegressor from sklearn.experimental import enable_iterative_imputer from sklearn.kernel_approximation import Nystroem from sklearn.linear_model import BayesianRidge, Ridge from sklearn.neighbors import KNeighborsRegressor from statsmodels.graphics.mosaicplot import mosaic

%run pandas-missing-extension.ipynb

%matplotlib inline sns.set( rc={ "figure.figsize": (8, 6) } ) sns.set_style("whitegrid")

# Build the air-quality frame indexed by date:
#   1. fetch the "airquality" R dataset through statsmodels,
#   2. normalise column names to snake_case (pyjanitor),
#   3. add a constant `year` column so year/month/day can form a full date,
#   4. sort chronologically and use the date as the index.
airquality_df = (
    sm.datasets.get_rdataset("airquality")
    .data
    .clean_names(case_type="snake")
    .add_column("year", 1973)
    .assign(
        date=lambda frame: pd.to_datetime(frame[["year", "month", "day"]])
    )
    .sort_values(by="date")
    .set_index("date")
)
airquality_df

# Header table of the OLS fit of temperature on ozone alone.
smf.ols(formula="temp ~ ozone", data=airquality_df).fit().summary().tables[0]

# Same header table after adding solar radiation as a second predictor.
smf.ols(formula="temp ~ ozone + solar_r", data=airquality_df).fit().summary().tables[0]

# Replace `weight` by a boolean "is missing" indicator and split that
# indicator into one series per gender.
female_weight, male_weight = (
    nhanes_df
    .select_columns("gender", "weight")
    .transform_column("weight", pd.Series.isna, elementwise=False)
    .groupby("gender")["weight"]
    .pipe(
        lambda grouped: tuple(
            grouped.get_group(label) for label in ("Female", "Male")
        )
    )
)

# Statistical test for whether the presence or absence of weight values
# differs between the two groups.
scipy.stats.ttest_ind(
    a=female_weight,
    b=male_weight,
    alternative="two-sided",
)

# Start from a deep copy so the original dataframe is left untouched.
nhanes_transformed_df = nhanes_df.copy(deep=True)

# Toy example: ordinal-encode a single categorical feature with three rows.
encoder = sklearn.preprocessing.OrdinalEncoder()
X = [[label] for label in ("Male", "Female", "Female")]
X

# Integer codes assigned to each row of the toy input.
encoder.fit_transform(X)

# Categories learned by the encoder during the fit above.
encoder.categories_

# Columns stored as object or category dtype are the ones to encode.
categorical_columns = nhanes_df.select_dtypes(
    include=[object, "category"]
).columns

# Ordinal-encode the categorical columns; `remainder="passthrough"` leaves
# every non-categorical column unmodified.
categorical_transformer = sklearn.compose.make_column_transformer(
    (sklearn.preprocessing.OrdinalEncoder(), categorical_columns),
    remainder="passthrough",
)

# Rebuild a DataFrame from the transformed array, keeping the original index
# and stripping the "<step>__" prefixes from the generated feature names.
nhanes_transformed_df = pd.DataFrame(
    categorical_transformer.fit_transform(nhanes_df),
    columns=categorical_transformer.get_feature_names_out(),
    index=nhanes_df.index,
).rename_columns(
    function=lambda name: (
        name
        .removeprefix("ordinalencoder__")
        .removeprefix("remainder__")
    )
)
nhanes_transformed_df

# Work on a separate deep copy for the one-hot encoding example.
nhanes_transformed_df2 = nhanes_df.copy(deep=True)

# Quick look at one-hot encoding using pandas alone.
(
    nhanes_transformed_df2
    .select_columns("general_health_condition")
    .pipe(pd.get_dummies)
)

# The cells below read `transformer`, but no visible cell ever defined it, so
# running the notebook top to bottom raised NameError. Define and fit it here.
# Only the fitted one-hot encoder is inspected afterwards, so every other
# column is dropped instead of being passed through.
# NOTE(review): the hard-coded row passed to inverse_transform has six entries,
# so the encoder is presumed to learn six categories for this column - confirm.
transformer = sklearn.compose.make_column_transformer(
    (sklearn.preprocessing.OneHotEncoder(), ["general_health_condition"]),
    remainder="drop",
)
transformer.fit(nhanes_transformed_df2)

# Categories learned by the fitted one-hot encoder.
(
    transformer
    .named_transformers_["onehotencoder"]
    .categories_
)

# Map a single one-hot row back to its original category label.
(
    transformer
    .named_transformers_["onehotencoder"]
    .inverse_transform(
        X=[[0, 0, 1, 0, 0, 0]]
    )
)

# Single-column mean imputation via pyjanitor's `transform_column`:
# missing heights are replaced by the column mean.
nhanes_df.transform_column(
    "height",
    lambda column: column.fillna(column.mean()),
    elementwise=False,
)

# Mean-impute height and weight, keeping "_imp" shadow columns so the plot can
# tell imputed points apart (NOTE(review): the positional True/False are
# presumably the labels for missing / not missing - confirm in the extension).
(
    nhanes_df
    .select_columns("height", "weight")
    .missing.bind_shadow_matrix(True, False, suffix="_imp")
    .assign(
        height=lambda frame: frame["height"].fillna(value=frame["height"].mean()),
        weight=lambda frame: frame["weight"].fillna(value=frame["weight"].mean()),
    )
    .missing.scatter_imputation_plot(x="height", y="weight")
)

# Same imputation, drawn with marginal distributions and a larger figure.
(
    nhanes_df
    .select_columns("height", "weight")
    .missing.bind_shadow_matrix(True, False, suffix="_imp")
    .assign(
        height=lambda frame: frame["height"].fillna(value=frame["height"].mean()),
        weight=lambda frame: frame["weight"].fillna(value=frame["weight"].mean()),
    )
    .missing.scatter_imputation_plot(
        x="height",
        y="weight",
        show_marginal=True,
        height=10,
    )
)

# Forward fill: each gap takes the last valid value above it.
# (`.ffill()` is used in place of the older `.fillna(method="ffill")` spelling.)
nhanes_df.select_columns("height", "weight").ffill()

# Backward fill: each gap takes the next valid value below it.
# (`.bfill()` is used in place of the older `.fillna(method="bfill")` spelling.)
nhanes_df.select_columns("height", "weight").bfill()

# Sort by related variables first, so that the value carried forward into a
# missing weight comes from a neighbouring row in that ordering.
(
    nhanes_df
    .select_columns(
        "height", "weight", "gender", "diabetes", "general_health_condition"
    )
    .sort_values(
        by=["gender", "diabetes", "general_health_condition", "height"],
        ascending=True,
    )
    .transform_column("weight", pd.Series.ffill, elementwise=False)
)

# Observed ozone series only; gaps in the line are the missing values.
airquality_df["ozone"].plot(color="#313638", marker="o")

# Each figure below overlays a filled-in series (red, dashed) with the
# observed series (dark), so the imputed stretches stand out.

# Forward fill.
plt.figure(figsize=(20, 10))
(
    airquality_df["ozone"].ffill().plot(
        color="red", marker="o", alpha=6/9, linestyle="dashed"
    ),
    airquality_df["ozone"].plot(color="#313638", marker="o"),
)

# Backward fill.
plt.figure(figsize=(20, 10))
(
    airquality_df["ozone"].bfill().plot(
        color="red", marker="o", alpha=6/9, linestyle="dashed"
    ),
    airquality_df["ozone"].plot(color="#313638", marker="o"),
)

# Nearest-neighbour interpolation.
plt.figure(figsize=(20, 10))
(
    airquality_df["ozone"].interpolate(method="nearest").plot(
        color="red", marker="o", alpha=6/9, linestyle="dashed"
    ),
    airquality_df["ozone"].plot(color="#313638", marker="o"),
)

# Linear interpolation.
plt.figure(figsize=(20, 10))
(
    airquality_df["ozone"].interpolate(method="linear").plot(
        color="red", marker="o", alpha=6/9, linestyle="dashed"
    ),
    airquality_df["ozone"].plot(color="#313638", marker="o"),
)

# Store the linearly interpolated series back into the dataframe.
airquality_df["ozone"] = airquality_df["ozone"].interpolate(method="linear")

# --- KNN imputation on the ordinal-encoded frame ---------------------------
knn_imputer = sklearn.impute.KNNImputer()

# Deep copy first, then overwrite every cell with the rounded imputer output.
# The copy holds the same data as `nhanes_transformed_df`, so it is used as
# the imputer input.
nhanes_df_knn = nhanes_transformed_df.copy(deep=True)
nhanes_df_knn.iloc[:, :] = np.round(knn_imputer.fit_transform(nhanes_df_knn))
nhanes_df_knn

# Imputed values side by side with the "_imp" shadow columns computed from the
# original data (only for variables that had missing values).
pd.concat(
    [
        nhanes_df_knn,
        nhanes_df.missing.create_shadow_matrix(
            True, False, suffix="_imp", only_missing=True
        ),
    ],
    axis="columns",
)

# Same combined frame, plotted as height against weight.
pd.concat(
    [
        nhanes_df_knn,
        nhanes_df.missing.create_shadow_matrix(
            True, False, suffix="_imp", only_missing=True
        ),
    ],
    axis="columns",
).missing.scatter_imputation_plot(x="height", y="weight")

# --- KNN imputation with the variables reordered by missingness ------------
knn_imputer = sklearn.impute.KNNImputer()

# Deep copy of the reordered frame, then overwrite it with the rounded
# imputer output computed from that same reordered data.
nhanes_df_knn = (
    nhanes_transformed_df
    .missing.sort_variables_by_missingness(ascending=True)
    .copy(deep=True)
)
nhanes_df_knn.iloc[:, :] = np.round(knn_imputer.fit_transform(nhanes_df_knn))
nhanes_df_knn

# Model-based imputation of `height` with an OLS regression on weight and age.
#
# The regression needs complete predictors, so missing weights are first filled
# with a forward fill. The rows must be ordered by *other* variables before the
# fill: sorting by `weight` itself puts all NaN weights last, and the forward
# fill then gives every missing weight the single largest observed weight,
# which skews the regression. Sorting by gender, age and height instead makes
# each gap take the weight of a neighbouring row in that ordering, like the
# sort-then-ffill cell earlier in this notebook.
# NOTE(review): assumes `gender` and `age` are (almost) complete - confirm.
nhanes_model_df = (
    nhanes_df
    .select_columns("height", "weight", "gender", "age")
    .sort_values(by=["gender", "age", "height"])
    .transform_column(
        "weight",
        lambda x: x.ffill(),
        elementwise=False,
    )
    # only_missing=False keeps a shadow column for every variable; the plot at
    # the end is asked for both `weight` and `height`.
    .missing.bind_shadow_matrix(
        True,
        False,
        suffix="_imp",
        only_missing=False,
    )
)
nhanes_model_df

# Fit height ~ weight + age on the prepared frame.
height_ols = smf.ols("height ~ weight + age", data=nhanes_model_df).fit()

# Predict (and round) height only for the rows where it is missing.
ols_imputed_values = (
    nhanes_model_df
    .pipe(lambda df: df[df.height.isna()])
    .pipe(lambda df: height_ols.predict(df).round())
)
ols_imputed_values

# Write the predictions into the missing cells; the assignment aligns on the
# index, so each prediction lands on the row it was computed from.
nhanes_model_df.loc[nhanes_model_df.height.isna(), "height"] = ols_imputed_values
nhanes_model_df

# Scatter plot of weight against height from the `.missing` helpers.
nhanes_model_df.missing.scatter_imputation_plot(x="weight", y="height")

# Iterative (MICE-style) imputation: each variable is modelled with a
# BayesianRidge regressor, starting from mean-filled values, with
# imputation_order="ascending".
mice_imputer = sklearn.impute.IterativeImputer(
    estimator=BayesianRidge(),
    initial_strategy="mean",
    imputation_order="ascending",
)

# Deep copy of the encoded frame, then overwrite every cell with the rounded
# imputer output. The copy holds the same data as `nhanes_transformed_df`, so
# it is used as the imputer input.
nhanes_mice_df = nhanes_transformed_df.copy(deep=True)
nhanes_mice_df.iloc[:, :] = np.round(mice_imputer.fit_transform(nhanes_mice_df))

# Append the "_imp" shadow columns computed from the original data.
nhanes_mice_df = pd.concat(
    [
        nhanes_mice_df,
        nhanes_df.missing.create_shadow_matrix(True, False, suffix="_imp"),
    ],
    axis="columns",
)
nhanes_mice_df

# Decode the ordinal-encoded categorical columns back to their labels.
nhanes_imputed_df = nhanes_mice_df.copy(deep=True)

# The imputer predicts categorical codes with a regression, so a rounded code
# can fall outside the range the encoder knows. The encoder's inverse transform
# uses the codes as array indices: a negative code would silently pick the
# wrong category and a too-large code would raise IndexError. Clip each column
# to its valid range first; codes already in range are unchanged.
_ordinal_encoder = categorical_transformer.named_transformers_.ordinalencoder

# Highest valid code per column. A NaN entry in `categories_` (present when the
# column had missing values at fit time) is not a real label, so it is excluded.
_max_codes = [
    pd.notna(categories).sum() - 1
    for categories in _ordinal_encoder.categories_
]

nhanes_imputed_df[categorical_columns] = _ordinal_encoder.inverse_transform(
    X=np.clip(
        nhanes_mice_df[categorical_columns].to_numpy(dtype=float),
        0,
        _max_codes,
    )
)
nhanes_imputed_df

# Category frequencies in the original data (missing values are not counted).
nhanes_df["general_health_condition"].value_counts()

# Category frequencies after imputation, for comparison.
nhanes_imputed_df["general_health_condition"].value_counts()