import janitor
import matplotlib.pyplot as plt
import missingno
import numpy as np
import pandas as pd
import pyreadr
import seaborn as sns
import session_info
import upsetplot
# The notebook originally executed `%run pandas-missing-extension.ipynb`, which
# is not available here. Instead, try to import an equivalent Python module
# named `pandas_missing_extension` and fall back to a placeholder object when
# that module cannot be imported.
import importlib


class _PME:
    """Placeholder used when `pandas_missing_extension` cannot be imported.

    It is defined unconditionally (not only in the failure branch) so that
    later `isinstance(_pme, _PME)` checks work whether or not the import
    succeeded. The placeholder provides no helpers of its own; statements in
    this file that use the `DataFrame.missing` accessor still need the real
    extension to be loaded.
    """


try:
    _pme = importlib.import_module("pandas_missing_extension")
except ImportError:
    # Only a missing/unimportable module is treated as "not available"; any
    # other error raised while importing the extension is a real bug and is
    # allowed to propagate instead of being silently swallowed.
    _pme = _PME()
# Proceed with the rest of the notebook imports and environment setup as per previous cells.
import janitor
import matplotlib.pyplot as plt
import missingno
import numpy as np
import pandas as pd
import pyreadr
import seaborn as sns
import session_info
import upsetplot
# Report (in Spanish) that the environment is loaded and whether `_pme` is the
# imported extension ('cargada' = loaded) or a `_PME` placeholder
# ('no disponible' = not available).
print("Entorno cargado. Extensión personalizada:", 'cargada' if not isinstance(_pme, _PME) else 'no disponible')
# IPython magic (not plain Python): selects the inline matplotlib backend.
%matplotlib inline
# Seaborn defaults: set the "figure.figsize" rc parameter to 10x10 and use the
# "whitegrid" style.
sns.set(
rc={
"figure.figsize": (10, 10)
}
)
sns.set_style("whitegrid")
# Show how Python's `None` behaves with `or`, `==`, `is` and `type()`;
# each result is printed on its own line.
print(
None or True,
None or False,
None == None,
None is None,
# Arithmetic on None is not supported (TypeError), so these stay disabled:
# None + True,
# None / False,
type(None),
sep="\n"
)
# Same experiment with NumPy's NaN: it is a float, so arithmetic works and
# yields NaN; `==` compares False against itself while `is` checks identity,
# and `np.isnan` is the explicit test for it.
print(
np.nan or True,
np.nan or False,
np.nan == np.nan,
np.nan is np.nan,
np.nan / 2,
np.nan * 7,
type(np.nan),
np.isnan(np.nan),
sep="\n"
)
# Toy frame mixing the different markers pandas treats as missing:
# `np.nan`, `None` and `pd.NA`.
test_missing_df = pd.DataFrame.from_dict(
    data={
        "x": [0, 1, np.nan, np.nan, None],
        "y": [0, 1, pd.NA, np.nan, None],
    }
)
test_missing_df

# Missing-value masks for the whole frame (`isna` / `isnull`) and for a
# single column.
test_missing_df.isna()
test_missing_df.isnull()
test_missing_df["x"].isnull()

# Series examples: a number next to NaN, a timestamp next to NaN, and a
# sentinel-like value (-1) checked with `isnull`.
pd.Series([1, np.nan])
pd.Series([pd.to_datetime("2022-01-01"), np.nan])
pd.Series([-1]).isnull()
# Download the Pima Indians diabetes CSV into ./data and load it.
pima_indians_diabetes_url = "https://nrvis.com/data/mldata/pima-indians-diabetes.csv"
# IPython shell escape (not plain Python): quiet wget writing to the given path.
# NOTE(review): presumably requires `wget` and an existing ./data directory — confirm.
!wget -O ./data/pima-indians-diabetes.csv { pima_indians_diabetes_url } -q
# Column names are supplied explicitly through `names`.
# NOTE(review): this assumes the CSV has no header row — confirm against the file.
diabetes_df = pd.read_csv(
filepath_or_buffer="./data/pima-indians-diabetes.csv", # or pima_indians_diabetes_url
sep=",",
names=[
"pregnancies",
"glucose",
"blood_pressure",
"skin_thickness",
"insulin",
"bmi",
"diabetes_pedigree_function",
"age",
"outcome",
]
)
base_url = "https://github.com/njtierney/naniar/raw/master/data/"
datasets_names = ("oceanbuoys", "pedestrian", "riskfactors")
extension = ".rda"
datasets_dfs = {}
for dataset_name in datasets_names:
dataset_file = f"{ dataset_name }{ extension }"
dataset_output_file = f"./data/{ dataset_file }"
dataset_url = f"{ base_url }{ dataset_file }"
!wget -q -O { dataset_output_file } { dataset_url }
datasets_dfs[f"{ dataset_name }_df"] = pyreadr.read_r(dataset_output_file).get(dataset_name)
datasets_dfs.keys()
locals().update(**datasets_dfs)
del datasets_dfs
oceanbuoys_df.shape, pedestrian_df.shape, riskfactors_df.shape, diabetes_df.shape
# First look at the risk-factors data: column info, the boolean missingness
# mask and the frame's dimensions.
riskfactors_df.info()
riskfactors_df.isna()
riskfactors_df.shape
# NOTE(review): `.missing` is not a built-in pandas accessor; presumably it is
# registered by the pandas-missing-extension notebook mentioned at the top of
# this file — confirm it is loaded before running these statements.
# Counts of complete/missing values, then per-variable and per-case
# summaries and tables.
riskfactors_df.missing.number_complete()
riskfactors_df.missing.number_missing()
riskfactors_df.missing.missing_variable_summary()
riskfactors_df.missing.missing_variable_table()
riskfactors_df.missing.missing_case_summary()
riskfactors_df.missing.missing_case_table()
# Missingness of `weight_lbs` summarised over spans of 50 (`span_every=50`).
(
riskfactors_df
.missing
.missing_variable_span(
variable="weight_lbs",
span_every=50
)
)
# Run-based summary of missingness for `weight_lbs`.
(
riskfactors_df
.missing
.missing_variable_run(
variable="weight_lbs"
)
)
# The same sequence of summaries as above, repeated verbatim.
riskfactors_df.missing.number_complete()
riskfactors_df.missing.number_missing()
riskfactors_df.missing.missing_variable_summary()
riskfactors_df.missing.missing_variable_table()
riskfactors_df.missing.missing_case_summary()
riskfactors_df.missing.missing_case_table()
(
riskfactors_df
.missing
.missing_variable_span(
variable="weight_lbs",
span_every=50
)
)
(
riskfactors_df
.missing
.missing_variable_run(
variable="weight_lbs"
)
)
# Plot counterparts of the summaries: per variable, per case, and by span
# (here with spans of 10 and unrotated tick labels, `rot=0`).
riskfactors_df.missing.missing_variable_plot()
riskfactors_df.missing.missing_case_plot()
(
riskfactors_df
.missing
.missing_variable_span_plot(
variable="weight_lbs",
span_every=10,
rot=0
)
)
# missingno visualisations of the same frame: bar chart and nullity matrix.
missingno.bar(df = riskfactors_df)
missingno.matrix(df=riskfactors_df)
# UpSet plot from the accessor; `variables=None` passes no explicit variable
# selection.
(
riskfactors_df
.missing
.missing_upsetplot(
variables = None,
element_size = 60
)
)
# Strings that commonly encode a missing value in raw data. The upper-case and
# lower-case "N/A" families are kept parallel: plain, with inner spaces, and
# with leading/trailing spaces.
common_na_strings = (
    "missing",
    "NA",
    "N A",
    "N/A",
    "#N/A",
    "NA ",
    " NA",
    "N /A",
    "N / A",
    " N / A",
    "N / A ",
    "na",
    "n a",
    "n/a",
    "na ",
    " na",
    "n /a",
    "n / a",
    # Was " a / a": a typo for the lower-case twin of " N / A".
    " n / a",
    "n / a ",
    "NULL",
    "null",
    "",
    "?",
    "*",
    ".",
)

# Numeric sentinel values that commonly encode a missing value.
common_na_numbers = (-9, -99, -999, -9999, 9999, 66, 77, 88, -1)
# Example frame where missing values are encoded as the strings "NA"/"N/A" and
# as negative sentinel numbers rather than as real NaN values.
missing_data_example_df = pd.DataFrame.from_dict(
dict(
x = [1, 3, "NA", -99, -98, -99],
y = ["A", "N/A", "NA", "E", "F", "G"],
z = [-100, -99, -98, -101, -1, -1]
)
)
missing_data_example_df
# Inspect the frame: missing count via the external `.missing` accessor,
# column dtypes, and the distinct values of `x`.
missing_data_example_df.missing.number_missing()
missing_data_example_df.dtypes
missing_data_example_df.x.unique()
# Distinct values of every object-dtype column.
(
missing_data_example_df
.select_dtypes(object)
.apply(pd.unique)
)
# Declare the sentinels as missing while reading a CSV.
# NOTE(review): the file name is spelled "enconding" — confirm it matches the
# file on disk.
pd.read_csv(
"./data/missing_data_enconding_example.csv",
na_filter=True,
na_values=[-99, -1]
)
# Replace the markers -99 and "NA" with NaN in every column.
(
missing_data_example_df
.replace(
to_replace=[-99, "NA"],
value=np.nan
)
)
# Replace -99 with NaN only in column `x` (nested-dict form of `to_replace`).
(
missing_data_example_df
.replace(
to_replace={
"x": {
-99: np.nan
}
}
)
)
# Long-format example: "lynn" has three time slots, "zelda" only "morning",
# so zelda's other slots are absent rows rather than NaN values.
implicit_to_explicit_df = pd.DataFrame.from_dict(
data={
"name": ["lynn", "lynn", "lynn", "zelda"],
"time": ["morning", "afternoon", "night", "morning"],
"value": [350, 310, np.nan, 320]
}
)
implicit_to_explicit_df
# Reshape to one row per name and one column per time slot.
# NOTE(review): `pivot_wider` is not a pandas built-in; presumably provided by
# pyjanitor (imported as `janitor`) — confirm.
(
implicit_to_explicit_df
.pivot_wider(
index="name",
names_from="time",
values_from="value"
)
)
# Count rows per name and keep the names that appear fewer than twice.
(
implicit_to_explicit_df
.value_counts(
subset=["name"]
)
.reset_index(name="n")
.query("n < 2")
)
# `complete` over the "name" and "time" columns.
# NOTE(review): presumably the pyjanitor method (the calls below are tagged
# as such) — confirm.
(
implicit_to_explicit_df
.complete(
"name",
"time",
)
)
# Same, but restricted to the listed names and time slots, with `sort=True`.
(
implicit_to_explicit_df
# pyjanitor
.complete(
{"name": ["lynn", "zelda"]},
{"time": ["morning", "afternoon"]},
sort=True
)
)
# Same as the first call, passing NaN explicitly as `fill_value`.
(
implicit_to_explicit_df
# pyjanitor
.complete(
"name",
"time",
fill_value=np.nan
)
)
# Pass 0 as `fill_value` together with `explicit=False`.
(
implicit_to_explicit_df
# pyjanitor
.complete(
"name",
"time",
fill_value=0,
explicit=False
)
)
# Missingness per variable before any recoding.
diabetes_df.missing.missing_variable_plot()
# Recode zeros as NaN in the columns at positions 1..5 of the frame.
# NOTE(review): presumably zeros in these measurements stand for "not
# recorded" — confirm against the dataset description.
diabetes_df[diabetes_df.columns[1:6]] = diabetes_df[diabetes_df.columns[1:6]].replace(0, np.nan)
# Same plot after the recoding.
diabetes_df.missing.missing_variable_plot()
# Nullity matrix with the variables ordered by the external accessor's
# `sort_variables_by_missingness`.
(
diabetes_df
.missing.sort_variables_by_missingness()
.pipe(missingno.matrix)
)
# Same matrix with the rows additionally sorted by `blood_pressure`.
(
diabetes_df
.missing.sort_variables_by_missingness()
.sort_values(by = "blood_pressure")
.pipe(missingno.matrix)
)
# Same matrix with the rows sorted by `insulin`.
(
diabetes_df
.missing.sort_variables_by_missingness()
.sort_values("insulin")
.pipe(missingno.matrix)
)
# Build a shadow matrix by hand: take the boolean missingness mask, relabel
# False/True as "Not missing"/"Missing", suffix every column name with "_NA",
# and append the result to the original columns.
(
riskfactors_df
.isna()
.replace({
False: "Not missing",
True: "Missing"
})
.add_suffix("_NA")
.pipe(
lambda shadow_matrix: pd.concat(
[riskfactors_df, shadow_matrix],
axis="columns"
)
)
)
# Accessor-based counterpart.
# NOTE(review): `bind_shadow_matrix` comes from the external `.missing`
# accessor; `only_missing=True` presumably limits the shadow columns to
# variables that contain missing values — confirm.
(
riskfactors_df
.missing
.bind_shadow_matrix(only_missing = True)
)
# Descriptive statistics of `age`, grouped by the `weight_lbs_NA` shadow column.
(
riskfactors_df
.missing
.bind_shadow_matrix(only_missing=True)
.groupby(["weight_lbs_NA"])
["age"]
.describe()
.reset_index()
)
# KDE of `age`, coloured by the `weight_lbs_NA` shadow column.
(
riskfactors_df
.missing
.bind_shadow_matrix(only_missing=True)
.pipe(
lambda df: (
sns.displot(
data=df,
x="age",
hue="weight_lbs_NA",
kind="kde"
)
)
)
)
# Boxen plot of `age` for each value of `weight_lbs_NA`.
(
riskfactors_df
.missing
.bind_shadow_matrix(only_missing=True)
.pipe(
lambda df: (
sns.boxenplot(
data=df,
x="weight_lbs_NA",
y="age",
)
)
)
)
# Distribution of `age`, one facet column per `weight_lbs_NA` value, with
# independent y axes (`sharey=False`).
(
riskfactors_df
.missing
.bind_shadow_matrix(only_missing=True)
.pipe(
lambda df: (
sns.displot(
data=df,
x="age",
col="weight_lbs_NA",
facet_kws={
"sharey": False
}
)
)
)
)
# Distribution of `age` faceted by two shadow columns: `marital_NA` across
# columns and `weight_lbs_NA` across rows.
(
riskfactors_df
.missing
.bind_shadow_matrix(only_missing=True)
.pipe(
lambda df: (
sns.displot(
data=df,
x="age",
col="marital_NA",
row="weight_lbs_NA"
)
)
)
)
def column_fill_with_dummies(
    column: pd.Series,
    proportion_below: float = 0.10,
    jitter: float = 0.075,
    seed: int = 42,
) -> pd.Series:
    """Return a copy of *column* whose missing values are replaced by dummies.

    Each missing entry receives ``shift + noise`` where

    * ``shift = min - min * proportion_below`` and
    * ``noise = (u - 2) * (max - min) * jitter`` with ``u`` drawn uniformly
      from ``[0, 1)``.

    Since ``u - 2`` lies in ``[-2, -1)``, the noise term is negative for a
    positive range and jitter, so the dummies are pushed below ``shift``.

    Args:
        column: Series to fill; it is not modified.
        proportion_below: Fraction of the minimum subtracted from the minimum
            to obtain the base position of the dummies.
        jitter: Scale of the random noise, relative to the column's range.
        seed: Seed for the noise, so repeated calls give the same dummies.

    Returns:
        A deep copy of *column* with the missing positions filled; the
        non-missing values are unchanged.
    """
    column = column.copy(deep=True)

    # Extract values metadata (min/max skip missing values by default).
    missing_mask = column.isna()
    number_missing_values = int(missing_mask.sum())
    column_min = column.min()
    column_range = column.max() - column_min

    # Base position of the dummies, relative to the observed minimum.
    column_shift = column_min - column_min * proportion_below

    # Draw the noise from a private generator. A RandomState seeded with
    # `seed` yields the same numbers as `np.random.seed(seed)` followed by
    # `np.random.rand(...)`, but leaves the global NumPy random state alone.
    rng = np.random.RandomState(seed)
    column_jitter = (rng.rand(number_missing_values) - 2) * column_range * jitter

    # Save new dummy data.
    column[missing_mask] = column_shift + column_jitter

    return column
# Scatter plot of weight vs. height in which missing values are drawn as
# dummy points produced by `column_fill_with_dummies`.
plt.figure(figsize=(10, 10))
(
riskfactors_df
# Drop the categorical columns.
.select_dtypes(
exclude="category"
)
# Keep only the columns that contain at least one missing value.
.pipe(
lambda df: df[df.columns[df.isna().any()]]
)
# NOTE(review): presumably adds "<col>_NA" indicator columns holding
# True/False (external `.missing` accessor) — confirm.
.missing.bind_shadow_matrix(true_string=True, false_string=False)
# Leave the "_NA" columns untouched; fill every other column with dummies.
.apply(
lambda column: column if "_NA" in column.name else column_fill_with_dummies(column, proportion_below=0.05, jitter=0.075)
)
# Flag rows where either weight or height was missing.
.assign(
nullity=lambda df: df.weight_lbs_NA | df.height_inch_NA
)
.pipe(
lambda df: (
sns.scatterplot(
data=df,
x="weight_lbs",
y="height_inch",
hue="nullity"
)
)
)
)
# missingno views of the whole frame: heatmap and dendrogram.
missingno.heatmap(
df=riskfactors_df
)
missingno.dendrogram(
df=riskfactors_df
)
# Reference point: dimensions of the full frame.
riskfactors_df.shape
# Mean of `weight_lbs` with pandas' default handling of missing values.
(
riskfactors_df
.weight_lbs
.mean()
)
# Total number of entries vs. number of non-missing entries.
riskfactors_df.weight_lbs.size, riskfactors_df.weight_lbs.count()
# Mean computed without skipping missing values.
riskfactors_df.weight_lbs.mean(skipna=False)
# Shape after dropping rows where `weight_lbs` is missing.
(
riskfactors_df
.dropna(
subset=["weight_lbs"],
how="any"
)
.shape
)
# Shape after dropping rows where weight OR height is missing (how="any").
(
riskfactors_df
.dropna(
subset=["weight_lbs", "height_inch"],
how="any"
)
.shape
)
# Shape after dropping rows where weight AND height are both missing (how="all").
(
riskfactors_df
.dropna(
subset=["weight_lbs", "height_inch"],
how="all"
)
.shape
)
# Nullity matrix of the two columns after the how="any" drop.
# NOTE(review): `select_columns` is presumably the pyjanitor method — confirm.
(
riskfactors_df
.dropna(
subset=["weight_lbs", "height_inch"],
how="any"
)
.select_columns(["weight_lbs", "height_inch"])
.pipe(
lambda df: missingno.matrix(df)
)
)
# Nullity matrix of the two columns after the how="all" drop.
(
riskfactors_df
.dropna(
subset=["weight_lbs", "height_inch"],
how="all"
)
.select_columns(["weight_lbs", "height_inch"])
.pipe(
lambda df: missingno.matrix(df)
)
)
# Example where `name` is recorded only on the first row of each pair
# ("morning") and left missing on the second one ("afternoon").
implicit_to_explicit_df = pd.DataFrame(
    {
        "name": ["lynn", np.nan, "zelda", np.nan, "shadowsong", np.nan],
        "time": ["morning", "afternoon"] * 3,
        "value": [350, 310, 320, 350, 310, 320],
    }
)
implicit_to_explicit_df

# Forward-fill: each missing entry takes the last preceding non-missing value.
implicit_to_explicit_df.ffill()
# Distribution of `weight_lbs` after mean imputation, coloured by whether the
# value was originally missing (`weight_lbs_NA`).
#
# `sns.displot` is a figure-level function that creates its own figure, so it
# is not preceded by a `plt.figure(...)` call: such a call would not size this
# plot and would only leave an extra empty figure behind.
(
    riskfactors_df
    .select_columns("weight_lbs", "height_inch", "bmi")
    .missing.bind_shadow_matrix(true_string=True, false_string=False)
    # `apply` hands over one column at a time: value columns are filled with
    # their own mean, the "_NA" indicator columns are passed through unchanged.
    .apply(
        axis="rows",
        func=lambda column: column.fillna(column.mean()) if "_NA" not in column.name else column
    )
    .pipe(
        lambda df: (
            sns.displot(
                data=df,
                x="weight_lbs",
                hue="weight_lbs_NA"
            )
        )
    )
)
# Scatter plot of weight vs. height after mean imputation, highlighting the
# rows where either value was imputed.
plt.figure(figsize=(10, 10))
(
riskfactors_df
.select_columns("weight_lbs", "height_inch", "bmi")
# NOTE(review): presumably adds "<col>_NA" indicator columns holding
# True/False (external `.missing` accessor) — confirm.
.missing.bind_shadow_matrix(true_string=True, false_string=False)
# One column at a time: value columns are filled with their own mean, the
# "_NA" indicator columns are passed through unchanged.
.apply(
axis="rows",
func=lambda column: column.fillna(column.mean()) if "_NA" not in column.name else column
)
# A row counts as imputed when weight or height was originally missing.
.assign(
imputed=lambda df: df.weight_lbs_NA | df.height_inch_NA
)
.pipe(
lambda df: (
sns.scatterplot(
data=df,
x="weight_lbs",
y="height_inch",
hue="imputed"
)
)
)
)
# Distributions of every mean-imputed variable, one facet per variable,
# coloured by whether each value was originally missing.
#
# `sns.displot` is a figure-level function that creates its own figure, so it
# is not preceded by a `plt.figure(...)` call: such a call would not size this
# plot and would only leave an extra empty figure behind.
(
    riskfactors_df
    .select_columns("weight_lbs", "height_inch", "bmi")
    .missing.bind_shadow_matrix(true_string=True, false_string=False)
    # One column at a time: value columns are filled with their own mean, the
    # "_NA" indicator columns are passed through unchanged.
    .apply(
        axis="rows",
        func=lambda column: column.fillna(column.mean())
        if "_NA" not in column.name
        else column,
    )
    # First melt: the value columns become (variable, value) pairs while the
    # "*_NA" indicator columns are kept as identifiers.
    .pivot_longer(
        index="*_NA"
    )
    # Second melt: the indicator columns become (variable_NA, value_NA) pairs.
    .pivot_longer(
        index=["variable", 'value'],
        names_to="variable_NA",
        values_to="value_NA"
    )
    # The double melt pairs every variable with every indicator; keep only the
    # rows whose indicator name contains the variable's own name.
    .assign(
        valid=lambda df: df.apply(axis="columns", func=lambda row: row.variable in row.variable_NA)
    )
    .query("valid")
    .pipe(
        lambda df: (
            sns.displot(
                data=df,
                x="value",
                hue="value_NA",
                col="variable",
                common_bins=False,
                facet_kws={
                    "sharex": False,
                    "sharey": False
                }
            )
        )
    )
)
# Display the session information collected by the `session_info` package.
session_info.show()