import janitor
import matplotlib.pyplot as plt
import missingno
import nhanes.load
import numpy as np
import pandas as pd
import scipy.stats
import seaborn as sns
import session_info
import sklearn.compose
import sklearn.impute
import sklearn.preprocessing
import statsmodels.api as sm
import statsmodels.datasets
import statsmodels.formula.api as smf
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import BayesianRidge, Ridge
from sklearn.neighbors import KNeighborsRegressor
from statsmodels.graphics.mosaicplot import mosaic
%run pandas-missing-extension.ipynb
%matplotlib inline
# Global plotting defaults: 8x6 figures drawn with the "whitegrid" style.
sns.set(rc={"figure.figsize": (8, 6)})
sns.set_style("whitegrid")
# Load the R `airquality` dataset and index it by a real datetime.
airquality_df = (
    sm.datasets.get_rdataset("airquality")
    .data
    # Normalise the column names to snake_case.
    .clean_names(case_type="snake")
    # Add a constant year so year/month/day can be combined into a date below.
    .add_column("year", 1973)
    .assign(
        date=lambda frame: pd.to_datetime(frame[["year", "month", "day"]])
    )
    .sort_values(by="date")
    .set_index("date")
)
airquality_df
# Header table of the OLS summary for `temp` regressed on `ozone` alone.
(
    smf.ols(formula="temp ~ ozone", data=airquality_df)
    .fit()
    .summary()
    .tables[0]
)
# Same header table with `solar_r` added as a second regressor.
(
    smf.ols(formula="temp ~ ozone + solar_r", data=airquality_df)
    .fit()
    .summary()
    .tables[0]
)
# Build, per gender, the boolean "weight is missing" indicator series.
female_weight, male_weight = (
    nhanes_df
    .select_columns("gender", "weight")
    # Replace the whole column with a flag: True where the weight is missing.
    .transform_column("weight", lambda col: col.isna(), elementwise=False)
    .groupby("gender")["weight"]
    .pipe(
        lambda grouped: tuple(
            grouped.get_group(label) for label in ("Female", "Male")
        )
    )
)
# Now run a statistical test to determine whether the presence or absence of
# weight values differs between the two groups.
scipy.stats.ttest_ind(
    a=female_weight,
    b=male_weight,
    alternative="two-sided",
)
# First, make a deep copy of the original dataframe.
nhanes_transformed_df = nhanes_df.copy(deep=True)

# Toy demonstration of ordinal encoding on a single-column input.
encoder = sklearn.preprocessing.OrdinalEncoder()
X = [
    ["Male"],
    ["Female"],
    ["Female"],
]
X
encoder.fit_transform(X)
# Categories learned by the encoder for each input column.
encoder.categories_
# Names of the columns whose dtype is object or category.
categorical_columns = nhanes_df.select_dtypes(include=[object, "category"]).columns

# Ordinal-encode the categorical columns only.
categorical_transformer = sklearn.compose.make_column_transformer(
    (sklearn.preprocessing.OrdinalEncoder(), categorical_columns),
    # Leave the non-categorical columns unmodified.
    remainder="passthrough",
)

# Rebuild a dataframe from the transformed array, keeping the original index
# and stripping the prefixes the column transformer adds to the feature names.
nhanes_transformed_df = (
    pd.DataFrame(
        data=categorical_transformer.fit_transform(nhanes_df),
        columns=categorical_transformer.get_feature_names_out(),
        index=nhanes_df.index,
    )
    .rename_columns(
        function=lambda name: (
            name.removeprefix("ordinalencoder__").removeprefix("remainder__")
        )
    )
)
nhanes_transformed_df
nhanes_transformed_df2 = nhanes_df.copy(deep=True)
# Dummy (one-hot) encoding of the `general_health_condition` column.
pd.get_dummies(
    nhanes_transformed_df2.select_columns("general_health_condition")
)
# NOTE(review): `transformer` is not defined in the visible part of this file;
# presumably it is a fitted column transformer containing a "onehotencoder"
# step - confirm it is created before this point.

# Categories learned by the one-hot encoder.
transformer.named_transformers_.get("onehotencoder").categories_

# Map a one-hot encoded row back to its original category.
transformer.named_transformers_.get("onehotencoder").inverse_transform(
    X=[[0, 0, 1, 0, 0, 0]]
)
(
    nhanes_df
    # pyjanitor: transform the column as a whole (not element by element),
    # filling missing heights with the column mean.
    .transform_column(
        "height",
        lambda col: col.fillna(col.mean()),
        elementwise=False,
    )
)
# Mean imputation of height and weight, visualised as a scatter plot.
# `bind_shadow_matrix` comes from the pandas-missing extension; presumably it
# appends "_imp"-suffixed columns flagging the originally missing values, with
# the positional True/False being the markers used - confirm in the extension.
(
    nhanes_df
    .select_columns("height", "weight")
    .missing.bind_shadow_matrix(True, False, suffix="_imp")
    .assign(
        height=lambda frame: frame["height"].fillna(value=frame["height"].mean()),
        weight=lambda frame: frame["weight"].fillna(value=frame["weight"].mean()),
    )
    .missing.scatter_imputation_plot(x="height", y="weight")
)
# Same plot, additionally passing `show_marginal=True` and `height=10`.
(
    nhanes_df
    .select_columns("height", "weight")
    .missing.bind_shadow_matrix(True, False, suffix="_imp")
    .assign(
        height=lambda frame: frame["height"].fillna(value=frame["height"].mean()),
        weight=lambda frame: frame["weight"].fillna(value=frame["weight"].mean()),
    )
    .missing.scatter_imputation_plot(
        x="height",
        y="weight",
        show_marginal=True,
        height=10,
    )
)
# Forward fill: propagate the last valid observation downwards.
# `.ffill()` is used instead of the deprecated `.fillna(method="ffill")`.
nhanes_df.select_columns("height", "weight").ffill()

# Backward fill: propagate the next valid observation upwards.
# `.bfill()` is used instead of the deprecated `.fillna(method="bfill")`.
nhanes_df.select_columns("height", "weight").bfill()
# Forward-fill weight after sorting, so that each missing value is filled from
# the preceding row in gender / diabetes / health-condition / height order.
(
    nhanes_df
    .select_columns(
        "height",
        "weight",
        "gender",
        "diabetes",
        "general_health_condition",
    )
    .sort_values(
        by=["gender", "diabetes", "general_health_condition", "height"],
        ascending=True,
    )
    .transform_column("weight", lambda col: col.ffill(), elementwise=False)
)
# Line plot of the raw ozone series.
(
    airquality_df
    .select_columns("ozone")
    .pipe(lambda frame: frame["ozone"].plot(color="#313638", marker="o"))
)
# Forward-filled ozone (red, dashed) drawn together with the observed series.
plt.figure(figsize=(20, 10))
(
    airquality_df
    .select_columns("ozone")
    .pipe(
        lambda frame: (
            frame["ozone"].ffill().plot(
                color="red", marker="o", alpha=6/9, linestyle="dashed"
            ),
            frame["ozone"].plot(color="#313638", marker="o"),
        )
    )
)
# Backward-filled ozone (red, dashed) drawn together with the observed series.
plt.figure(figsize=(20, 10))
(
    airquality_df
    .select_columns("ozone")
    .pipe(
        lambda frame: (
            frame["ozone"].bfill().plot(
                color="red", marker="o", alpha=6/9, linestyle="dashed"
            ),
            frame["ozone"].plot(color="#313638", marker="o"),
        )
    )
)
# Nearest-neighbour interpolation (red, dashed) drawn with the observed series.
plt.figure(figsize=(20, 10))
(
    airquality_df
    .select_columns("ozone")
    .pipe(
        lambda frame: (
            frame["ozone"].interpolate(method="nearest").plot(
                color="red", marker="o", alpha=6/9, linestyle="dashed"
            ),
            frame["ozone"].plot(color="#313638", marker="o"),
        )
    )
)
# Linear interpolation (red, dashed) drawn with the observed series.
plt.figure(figsize=(20, 10))
(
    airquality_df
    .select_columns("ozone")
    .pipe(
        lambda frame: (
            frame["ozone"].interpolate(method="linear").plot(
                color="red", marker="o", alpha=6/9, linestyle="dashed"
            ),
            frame["ozone"].plot(color="#313638", marker="o"),
        )
    )
)
# Keep the linear interpolation: overwrite the ozone column in place.
airquality_df["ozone"] = airquality_df["ozone"].interpolate(method="linear")
# K-nearest-neighbours imputation with scikit-learn's default settings.
knn_imputer = sklearn.impute.KNNImputer()

# Copy of the encoded dataframe, so the imputed values do not overwrite it.
nhanes_df_knn = nhanes_transformed_df.copy(deep=True)

# Fill every cell of the copy with the imputed matrix, rounded.
nhanes_df_knn.iloc[:, :] = (
    knn_imputer
    .fit_transform(nhanes_transformed_df)
    .round()
)
nhanes_df_knn
# KNN-imputed values side by side with the shadow matrix of the original data.
# `create_shadow_matrix` comes from the pandas-missing extension; presumably
# the "_imp" columns flag which values were originally missing - confirm there.
pd.concat(
    [
        nhanes_df_knn,
        nhanes_df.missing.create_shadow_matrix(
            True, False, suffix="_imp", only_missing=True
        ),
    ],
    axis=1,
)
# Same combined frame, plotted as height against weight.
(
    pd.concat(
        [
            nhanes_df_knn,
            nhanes_df.missing.create_shadow_matrix(
                True, False, suffix="_imp", only_missing=True
            ),
        ],
        axis=1,
    )
    .missing.scatter_imputation_plot(x="height", y="weight")
)
# KNN imputation on the encoded data after `sort_variables_by_missingness`
# (from the pandas-missing extension; presumably it reorders the columns by
# their amount of missing values - confirm in the extension).
knn_imputer = sklearn.impute.KNNImputer()

# Build the sorted frame once and deep-copy it so the encoded dataframe is
# left untouched; the same copy is both the imputer's input and the
# destination of the imputed values.
nhanes_df_knn = (
    nhanes_transformed_df
    .missing.sort_variables_by_missingness(ascending=True)
    .copy(deep=True)
)

# The right-hand side is fully evaluated before the assignment, so the imputer
# sees the still-incomplete data; afterwards every cell is replaced by the
# rounded imputed matrix.
nhanes_df_knn.iloc[:, :] = knn_imputer.fit_transform(nhanes_df_knn).round()
nhanes_df_knn
# Frame used for the model-based imputation of height.
nhanes_model_df = (
    nhanes_df
    .select_columns("height", "weight", "gender", "age")
    .sort_values("weight")
    # Forward-fill weight (in sorted order) so it can be used as a predictor.
    .transform_column("weight", lambda col: col.ffill(), elementwise=False)
    # Append the "_imp" shadow columns from the pandas-missing extension.
    .missing.bind_shadow_matrix(
        True,
        False,
        suffix="_imp",
        only_missing=False,
    )
)
nhanes_model_df
# Fit an OLS model that predicts height from weight and age.
height_ols = smf.ols("height ~ weight + age", data=nhanes_model_df).fit()
# Predict (and round) the height for the rows where it is missing.
ols_imputed_values = (
    height_ols
    .predict(nhanes_model_df.loc[nhanes_model_df["height"].isna()])
    .round()
)
ols_imputed_values

# Write the predictions back into the rows that had no height.
nhanes_model_df.loc[nhanes_model_df["height"].isna(), ["height"]] = ols_imputed_values
nhanes_model_df
# Scatter plot of the model frame after the OLS-based height imputation.
nhanes_model_df.missing.scatter_imputation_plot(x="weight", y="height")
# Iterative (MICE-style) imputation: each feature is modelled from the others
# with a BayesianRidge regressor, starting from a mean fill.
mice_imputer = sklearn.impute.IterativeImputer(
    estimator=BayesianRidge(),
    initial_strategy="mean",
    imputation_order="ascending",
)

# Impute into a deep copy so the encoded dataframe stays untouched.
nhanes_mice_df = nhanes_transformed_df.copy(deep=True)
nhanes_mice_df.iloc[:, :] = (
    mice_imputer
    .fit_transform(nhanes_transformed_df)
    .round()
)

# Append the "_imp" shadow columns computed from the original data.
nhanes_mice_df = pd.concat(
    [
        nhanes_mice_df,
        nhanes_df.missing.create_shadow_matrix(True, False, suffix="_imp"),
    ],
    axis=1,
)
nhanes_mice_df
# Decode the ordinal codes of the categorical columns back to their labels.
# NOTE(review): the imputed codes are rounded but not clipped; confirm they
# always fall inside each category range before decoding.
nhanes_imputed_df = nhanes_mice_df.copy(deep=True)
nhanes_imputed_df[categorical_columns] = (
    categorical_transformer
    .named_transformers_["ordinalencoder"]
    .inverse_transform(X=nhanes_mice_df[categorical_columns])
)
nhanes_imputed_df

# Category counts before and after imputation.
nhanes_df["general_health_condition"].value_counts()
nhanes_imputed_df["general_health_condition"].value_counts()