import janitor
import matplotlib.pyplot as plt
import missingno
import nhanes.load
import numpy as np
import pandas as pd
import scipy.stats
import seaborn as sns
import session_info
import sklearn.compose
import sklearn.impute
import sklearn.preprocessing
import statsmodels.api as sm
import statsmodels.datasets
import statsmodels.formula.api as smf
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import BayesianRidge, Ridge
from sklearn.neighbors import KNeighborsRegressor
from statsmodels.graphics.mosaicplot import mosaic
%run pandas-missing-extension.ipynb
%matplotlib inline
sns.set(
rc={
"figure.figsize": (8, 6)
}
)
sns.set_style("whitegrid")
# Load R's `airquality` dataset, convert the column names to snake_case and
# index the rows by a real calendar date so the series can be plotted in time
# order.
airquality_df = (
    sm.datasets.get_rdataset("airquality")
    .data
    .clean_names(case_type="snake")
    # A constant `year` column is added so that year/month/day can be combined
    # into a datetime below. `DataFrame.assign` is used instead of pyjanitor's
    # `add_column`, which pyjanitor documents as slated for deprecation in
    # favour of `assign`; the resulting column is the same.
    .assign(year=1973)
    .assign(date=lambda df: pd.to_datetime(df[["year", "month", "day"]]))
    .sort_values(by="date")
    .set_index("date")
)
airquality_df
# Header table of an OLS fit of temperature on ozone alone.
# NOTE(review): the formula interface presumably drops rows with missing
# values, so the two fits below may be based on different numbers of
# observations -- compare "No. Observations" in the two tables.
smf.ols("temp ~ ozone", data=airquality_df).fit().summary().tables[0]

# Same header table after adding solar radiation as a second predictor.
smf.ols("temp ~ ozone + solar_r", data=airquality_df).fit().summary().tables[0]
# Fetch the `flchain` dataset from R's `survival` package.
survival_df = sm.datasets.get_rdataset("flchain", package="survival").data
# Your code here...
%run download-data-and-load-it.ipynb
female_weight, male_weight = (
nhanes_df
.select_columns("gender", "weight")
.transform_column("weight", lambda x: x.isna(), elementwise = False)
.groupby("gender")
.weight
.pipe(
lambda df: (
df.get_group("Female"),
df.get_group("Male")
)
)
)
scipy.stats.ttest_ind(
a = female_weight,
b = male_weight,
alternative="two-sided"
)
fig, ax = plt.subplots(figsize=(10, 10))

# Mosaic plot of row counts per (gender, health condition, weight missing?).
# Tiles whose key contains the "NA" label are drawn red, the rest gray, and
# the per-tile text labels are suppressed.
mosaic(
    data=(
        nhanes_df
        .select_columns("weight", "general_health_condition", "gender")
        # Turn `weight` into a two-level label: "NA" if missing, else "!NA".
        .assign(weight=lambda df: df.weight.isna().map({True: "NA", False: "!NA"}))
        .groupby(
            ["gender", "general_health_condition", "weight"],
            dropna=False,
            as_index=True,
        )
        .size()
    ),
    properties=lambda key: {"color": "r" if "NA" in key else "gray"},
    ax=ax,
    horizontal=True,
    axes_label=True,
    title="",
    labelizer=lambda key: "",
)
ax.grid(False)

# Per the original note, the `.missing` accessor offers a wrapper for the plot
# above; uncomment the call to use it:
#     nhanes_df.missing.missing_mosaic_plot()
nhanes_df.missing
# Independent deep copy of the raw data (rebuilt from scratch further below).
nhanes_transformed_df = nhanes_df.copy(deep=True)

# Toy example of OrdinalEncoder on a single-column list of labels: fit and
# encode, inspect the learned categories, then map integer codes back to
# their labels.
encoder = sklearn.preprocessing.OrdinalEncoder()
X = [[label] for label in ("Male", "Female", "Female")]
X
encoder.fit_transform(X)
encoder.categories_
encoder.inverse_transform([[1], [0], [0]])
# Every object-dtype column is treated as categorical.
categorical_columns = nhanes_df.select_dtypes(object).columns
categorical_columns

# Ordinal-encode the categorical columns; all other columns pass through.
categorical_transformer = sklearn.compose.make_column_transformer(
    (sklearn.preprocessing.OrdinalEncoder(), categorical_columns),
    remainder="passthrough",
)

# Rebuild a DataFrame from the transformer output, keeping the original index.
# The transformer prefixes each output column with the step that produced it;
# those prefixes are stripped so the columns keep their original names.
nhanes_transformed_df = pd.DataFrame(
    categorical_transformer.fit_transform(nhanes_df),
    columns=categorical_transformer.get_feature_names_out(),
    index=nhanes_df.index,
).rename_columns(
    function=lambda name: (
        name
        .removeprefix("ordinalencoder__")
        .removeprefix("onehotencoder__")
        .removeprefix("remainder__")
    )
)
nhanes_transformed_df
# gender_encoder = sklearn.preprocessing.OrdinalEncoder()
# gender_values = (
# nhanes_df
# .gender
# .values
# .reshape(-1, 1)
# )
# (
# gender_encoder
# .fit_transform(
# X = gender_values
# )
# .squeeze()
# )
# (
# nhanes_df.gender.unique(),
# nhanes_transformed_df.gender.unique(),
# gender_encoder.categories_
# )
# Separate deep copy used for the one-hot encoding variant.
nhanes_transformed_df2 = nhanes_df.copy(deep=True)

# Preview a pandas one-hot encoding of the health-condition column.
# (Chain `.head(10)` before `get_dummies`, or `.columns` after it, to inspect
# a smaller piece of the result.)
pd.get_dummies(nhanes_transformed_df2.select_columns("general_health_condition"))
# Mixed encoding: `gender` is ordinal-encoded, `general_health_condition` is
# one-hot encoded, and every other column passes through.
transformer = sklearn.compose.make_column_transformer(
    (sklearn.preprocessing.OrdinalEncoder(), ["gender"]),
    (sklearn.preprocessing.OneHotEncoder(), ["general_health_condition"]),
    remainder="passthrough"
)

# Rebuild a DataFrame from the transformer output, keeping the original index
# and stripping the step-name prefixes the transformer adds to column names.
nhanes_transformed_df2 = (
    pd.DataFrame(
        transformer.fit_transform(nhanes_df),
        columns = transformer.get_feature_names_out(),
        index = nhanes_df.index
    )
    .rename_columns(
        function = lambda x: x.removeprefix("ordinalencoder__")
    )
    .rename_columns(
        function = lambda x: x.removeprefix("onehotencoder__")
    )
    .rename_columns(
        function = lambda x: x.removeprefix("remainder__")
    )
)
# Show the frame that was just built. The original displayed
# `nhanes_transformed_df` (the ordinal-only frame) here, so the one-hot
# result was never inspected.
nhanes_transformed_df2
# Categories learned by the fitted one-hot encoder step.
transformer.named_transformers_["onehotencoder"].categories_

# Decode a single one-hot row (third position set) back to its category label.
# NOTE(review): the hard-coded row has six positions, so this presumably
# relies on the encoder having learned exactly six categories.
transformer.named_transformers_["onehotencoder"].inverse_transform(
    X=[[0, 0, 1, 0, 0, 0]]
)
# Mean of `height` after filling its missing values with the column mean.
nhanes_df.transform_column(
    "height",
    lambda heights: heights.fillna(heights.mean()),
    elementwise=False,
)["height"].mean()

# Scatter plot of mean-imputed height vs. weight. The shadow columns
# (suffix "_imp") are bound *before* filling, so they still record which
# values were originally missing.
plt.figure(figsize=(10, 10))
(
    nhanes_df
    .select_columns("height", "weight")
    .missing.bind_shadow_matrix(True, False, suffix="_imp")
    # Fill each of the two columns with its own mean in one call.
    .pipe(lambda df: df.fillna(df[["height", "weight"]].mean()))
    .missing.scatter_imputation_plot(x="height", y="weight")
)

# Same plot again, this time with marginal distributions and a size of 10.
(
    nhanes_df
    .select_columns("height", "weight")
    .missing.bind_shadow_matrix(True, False, suffix="_imp")
    .pipe(lambda df: df.fillna(df[["height", "weight"]].mean()))
    .missing.scatter_imputation_plot(
        x="height", y="weight", show_marginal=True, height=10
    )
)
# Forward fill: each gap takes the last observed value above it
# (`ffill()` is used in place of the older `fillna(method="ffill")` spelling).
nhanes_df.select_columns("height", "weight").ffill()

# Backward fill: each gap takes the next observed value below it
# (`bfill()` is used in place of the older `fillna(method="bfill")` spelling).
nhanes_df.select_columns("height", "weight").bfill()

# Forward fill `weight` after sorting, so that the value carried into a gap
# comes from the preceding row in gender/diabetes/health/height order.
(
    nhanes_df
    .select_columns(
        "height", "weight", "gender", "diabetes", "general_health_condition"
    )
    .sort_values(
        by=["gender", "diabetes", "general_health_condition", "height"],
        ascending=True,
    )
    .assign(weight=lambda df: df.weight.ffill())
)
def _plot_ozone(imputed=None):
    """Plot the observed ozone series on a new 20x10 figure.

    Args:
        imputed: Optional gap-filled version of ``airquality_df.ozone``.
            When given, it is drawn first as a dashed red line so that the
            observed series is drawn on top of it and only the filled-in
            stretches remain visible in red.

    Returns:
        The value returned by ``Series.plot`` for the observed series.
    """
    plt.figure(figsize=(20, 10))
    if imputed is not None:
        imputed.plot(color="red", marker="o", alpha=6 / 9, linestyle="dashed")
    return airquality_df.ozone.plot(color="#313638", marker="o")


# Observed series only: the gaps show where ozone is missing.
_plot_ozone()

# Forward fill and backward fill.
_plot_ozone(airquality_df.ozone.ffill())
_plot_ozone(airquality_df.ozone.bfill())

# Interpolation between neighbouring observations, three different methods.
_plot_ozone(airquality_df.ozone.interpolate(method="linear"))
_plot_ozone(airquality_df.ozone.interpolate(method="quadratic"))
_plot_ozone(airquality_df.ozone.interpolate(method="nearest"))
# Compare the raw frame with its numerically encoded counterpart.
nhanes_df.head()
nhanes_transformed_df.head()

# KNN imputation with the imputer's default settings. The result is written
# back in place so the frame keeps its index and column labels; `.round()` is
# applied to the whole imputed array.
knn_imputer = sklearn.impute.KNNImputer()
nhanes_df_knn = nhanes_transformed_df.copy(deep=True)
nhanes_df_knn.iloc[:, :] = knn_imputer.fit_transform(nhanes_df_knn).round()
nhanes_df_knn

# Attach the shadow matrix computed from the *raw* data so the plot can tell
# imputed points apart from observed ones.
pd.concat(
    [
        nhanes_df_knn,
        nhanes_df.missing.create_shadow_matrix(
            True, False, suffix="_imp", only_missing=True
        ),
    ],
    axis="columns",
).missing.scatter_imputation_plot(x="height", y="weight")

# Second run: 10 neighbours, with the columns first reordered by how much
# data they are missing.
knn_imputer = sklearn.impute.KNNImputer(n_neighbors=10)
nhanes_df_knn = (
    nhanes_transformed_df
    .missing.sort_variables_by_missingness(ascending=True)
    .copy(deep=True)
)
nhanes_df_knn.iloc[:, :] = knn_imputer.fit_transform(nhanes_df_knn).round()

pd.concat(
    [
        nhanes_df_knn,
        nhanes_df.missing.create_shadow_matrix(
            True, False, suffix="_imp", only_missing=True
        ),
    ],
    axis="columns",
).missing.scatter_imputation_plot(x="height", y="weight")
# Modelling frame: rows sorted by height, `weight` forward-filled so it can
# serve as a predictor, and shadow columns (suffix "_imp") bound afterwards.
nhanes_model_df = (
    nhanes_df
    .select_columns("height", "weight", "gender", "age")
    .sort_values(by="height")
    .assign(weight=lambda df: df.weight.ffill())
    .missing.bind_shadow_matrix(True, False, suffix="_imp")
)
nhanes_model_df

# Linear model of height on weight, gender and age.
height_ols = smf.ols(
    "height ~ weight + gender + age", data=nhanes_model_df
).fit()

# Rounded predictions for exactly the rows whose height is missing.
ols_imputed_values = height_ols.predict(
    nhanes_model_df[nhanes_model_df.height.isna()]
).round()
ols_imputed_values

# Write the predictions into the missing height cells.
nhanes_model_df.loc[nhanes_model_df.height.isna(), ["height"]] = ols_imputed_values

nhanes_model_df.missing.scatter_imputation_plot(x="height", y="weight")
# Iterative imputation using a Bayesian ridge regressor as the estimator.
mice_imputer = sklearn.impute.IterativeImputer(estimator=BayesianRidge())

# Impute on a copy of the encoded frame; the rounded result is written back
# in place so the index and column labels are kept.
nhanes_mice_df = nhanes_transformed_df.copy(deep=True)
nhanes_mice_df.iloc[:, :] = mice_imputer.fit_transform(nhanes_mice_df).round()

# Append the shadow matrix computed from the raw data so imputed points can be
# highlighted in the plot.
nhanes_mice_df = pd.concat(
    [
        nhanes_mice_df,
        nhanes_df.missing.create_shadow_matrix(
            True, False, suffix="_imp", only_missing=True
        ),
    ],
    axis="columns",
)

nhanes_mice_df.missing.scatter_imputation_plot(x="height", y="weight")
# Map the ordinal codes of the categorical columns back to their original
# labels, using the encoder fitted inside `categorical_transformer`.
nhanes_imputated_df = nhanes_mice_df.copy(deep=True)
nhanes_imputated_df[categorical_columns] = (
    categorical_transformer
    .named_transformers_["ordinalencoder"]
    .inverse_transform(X=nhanes_mice_df[categorical_columns])
)
nhanes_imputated_df

# Category counts before and after imputation, for comparison.
nhanes_df.general_health_condition.value_counts()
nhanes_imputated_df.general_health_condition.value_counts()

# Number of missing values remaining in the imputed frame.
nhanes_mice_df.missing.number_missing()

# Record the package versions used in this session.
session_info.show()