import janitor
import matplotlib.pyplot as plt
import missingno
import numpy as np
import pandas as pd
import pyreadr
import seaborn as sns
import session_info
import upsetplot
%matplotlib inline
sns.set(
rc={
"figure.figsize": (10, 10)
}
)
sns.set_style("whitegrid")
# Pieces used to build the download location of each example dataset:
# <base_url><dataset_name><extension>.
base_url = "https://github.com/njtierney/naniar/raw/master/data/"
dataset_names = (
    "oceanbuoys",
    "pedestrian",
    "riskfactors",
)
extension = ".rda"
# NOTE(review): the *_df frames and the `.missing` accessor are not defined in
# this part of the file; they are assumed to be created/registered elsewhere.

# Dimensions of every dataset.
oceanbuoys_df.shape, pedestrian_df.shape, riskfactors_df.shape, diabetes_df.shape

# Column dtypes and non-null counts.
riskfactors_df.info()

# Tabular missingness summaries provided by the `.missing` accessor.
riskfactors_df.missing.number_complete()
riskfactors_df.missing.number_missing()
riskfactors_df.missing.missing_case_summary()
riskfactors_df.missing.missing_case_table()

riskfactors_df.missing.missing_variable_span(
    variable="weight_lbs",  # variable to analyze
    span_every=50,  # interval size used to split the variable
)

riskfactors_df.missing.missing_variable_run(variable="weight_lbs")
# Missingness plots provided by the `.missing` accessor (defined elsewhere).
riskfactors_df.missing.missing_variable_plot()
riskfactors_df.missing.missing_case_plot()

riskfactors_df.missing.missing_variable_span_plot(
    variable="weight_lbs",
    span_every=10,
    rot=0,  # label rotation
)

# The trailing ";" keeps a notebook from echoing the returned object.
missingno.bar(df=riskfactors_df);
missingno.matrix(df=riskfactors_df);

riskfactors_df.missing.missing_upsetplot(
    variables=["pregnant", "weight_lbs", "smoke_stop"],  # target variable names
    # alternatively, pass None to see all variables
    element_size=60,  # size of the figure to create
);
# Strings that commonly encode a missing value in raw data.
# The lowercase "n/a" variants mirror the uppercase ones one-to-one; the entry
# " n / a" was previously misspelled as " a / a" and therefore never matched
# the leading-space lowercase form.
common_na_strings = (
    "missing",
    # Uppercase variants.
    "NA",
    "N A",
    "N/A",
    "#N/A",
    "NA ",
    " NA",
    "N /A",
    "N / A",
    " N / A",
    "N / A ",
    # Lowercase variants.
    "na",
    "n a",
    "n/a",
    "na ",
    " na",
    "n /a",
    "n / a",
    " n / a",
    "n / a ",
    # Null keywords, the empty string and placeholder symbols.
    "NULL",
    "null",
    "",
    "?",
    "*",
    ".",
)

# Numeric sentinel values that commonly encode a missing value.
common_na_numbers = (-9, -99, -999, -9999, 9999, 66, 77, 88, -1)
# Toy frame whose missing values are encoded as sentinels ("NA", "N/A", -99,
# -1, ...) rather than as real NaN values.
missing_data_example_df = pd.DataFrame.from_dict(
    {
        "x": [1, 3, "NA", -99, -98, -99],
        "y": ["A", "N/A", "NA", "E", "F", "G"],
        "z": [-100, -99, -98, -101, -1, -1],
    }
)

missing_data_example_df

# None of the sentinels above is recognised as missing by pandas.
missing_data_example_df.isna()
# Declare -99 and -1 as additional missing-value markers while parsing the CSV.
pd.read_csv(
    "./data/missing_data_enconding_example.csv",
    na_filter=True,
    na_values=[-99, -1],
)
# Replace the sentinels -99 and "NA" with NaN in every column.
missing_data_example_df.replace(to_replace=[-99, "NA"], value=np.nan)

# Replace -99 with NaN in column "x" only.
missing_data_example_df.replace(to_replace={"x": {-99: np.nan}})
# Long-format toy data: "zelda" only has a "morning" row, so her other time
# slots are absent from the table rather than recorded as NaN.
implicit_to_explicit_df = pd.DataFrame.from_dict(
    data={
        "name": ["lynn", "lynn", "lynn", "zelda"],
        "time": ["morning", "afternoon", "night", "morning"],
        "value": [350, 310, np.nan, 320],
    }
)

implicit_to_explicit_df
# Wide layout: one row per name, one column per time slot.
implicit_to_explicit_df.pivot_wider(
    index="name",
    names_from="time",
    values_from="value",
)

# Names that appear in fewer than 3 rows.
(
    implicit_to_explicit_df
    .value_counts(subset=["name"])
    .reset_index(name="n")
    .query("n<3")
)

# janitor
implicit_to_explicit_df.complete(
    "name",
    "time",
    fill_value=np.nan,  # any value can be used here for the nulls
)

# janitor
implicit_to_explicit_df.complete(
    "name",
    "time",
    fill_value=0,
    explicit=False,
)
# Missingness per variable before recoding.
diabetes_df.missing.missing_variable_plot()

# Recode 0 as NaN in the columns at positions 1 through 5 (modifies
# diabetes_df in place).
diabetes_df[diabetes_df.columns[1:6]] = (
    diabetes_df[diabetes_df.columns[1:6]]
    .replace(0, np.nan)
)

# Missingness per variable after recoding.
diabetes_df.missing.missing_variable_plot()

# Nullity matrix with the variables ordered by missingness.
(
    diabetes_df
    .missing.sort_variables_by_missingness()
    .pipe(missingno.matrix)
)

# Same matrix, with rows additionally sorted by "blood_pressure".
(
    diabetes_df
    .missing.sort_variables_by_missingness()
    .sort_values(by="blood_pressure")
    .pipe(missingno.matrix)
)

# Same matrix, with rows additionally sorted by "insulin".
(
    diabetes_df
    .missing.sort_variables_by_missingness()
    .sort_values(by="insulin")
    .pipe(missingno.matrix)
)
# Hand-built shadow matrix: one "<column>_NA" column per original column,
# labelled "Missing" / "Not missing", appended to the right of the data.
(
    riskfactors_df
    .isna()
    .replace({False: "Not missing", True: "Missing"})
    .add_suffix("_NA")
    .pipe(
        lambda shadow: pd.concat([riskfactors_df, shadow], axis="columns")
    )
)

# Same idea through the `.missing` accessor (defined elsewhere).
# only_missing=True will just concat the columns that have missing values.
riskfactors_df.missing.bind_shadow_matrix(only_missing=True)
# Descriptive statistics of "age", grouped by the "weight_lbs_NA" column.
(
    riskfactors_df
    .missing.bind_shadow_matrix(only_missing=True)
    .groupby(["weight_lbs_NA"])["age"]
    .describe()
    .reset_index()
)

# Boxen plot of "age" for each "weight_lbs_NA" value.
(
    riskfactors_df
    .missing.bind_shadow_matrix(only_missing=True)
    .pipe(
        lambda shadow_df: sns.boxenplot(
            data=shadow_df,
            x="weight_lbs_NA",
            y="age",
        )
    )
)

# Kernel density of "age", coloured by "weight_lbs_NA".
(
    riskfactors_df
    .missing.bind_shadow_matrix(only_missing=True)
    .pipe(
        lambda shadow_df: sns.displot(
            data=shadow_df,
            x="age",
            hue="weight_lbs_NA",
            kind="kde",
        )
    )
)

# Distribution of "age", one facet column per "weight_lbs_NA" value, with
# independent y axes.
(
    riskfactors_df
    .missing.bind_shadow_matrix(only_missing=True)
    .pipe(
        lambda shadow_df: sns.displot(
            data=shadow_df,
            x="age",
            col="weight_lbs_NA",
            facet_kws={"sharey": False},
        )
    )
)

# Distribution of "age" on a grid: "marital_NA" across columns and
# "weight_lbs_NA" across rows.
(
    riskfactors_df
    .missing.bind_shadow_matrix(only_missing=True)
    .pipe(
        lambda shadow_df: sns.displot(
            data=shadow_df,
            x="age",
            col="marital_NA",
            row="weight_lbs_NA",
        )
    )
)
def column_fill_with_dummies(
    column: pd.Series,
    proportion_below: float = 0.10,
    jitter: float = 0.075,
    seed: int = 42
) -> pd.Series:
    """Return a copy of *column* with its missing values replaced by dummies.

    The dummy values are placed below the observed minimum and spread with
    random noise, so that rows with missing data can be drawn on a plot
    instead of being dropped.

    Parameters
    ----------
    column
        Numeric series that may contain missing values. It is not modified.
    proportion_below
        Fraction of the absolute value of the minimum by which the dummy
        values are shifted below the minimum.
    jitter
        Fraction of the column range (max - min) used to scale the noise.
    seed
        Seed of the random number generator that produces the noise.

    Returns
    -------
    pd.Series
        A deep copy of *column* in which every missing entry holds a dummy
        value; observed entries are unchanged.
    """
    column = column.copy(deep=True)

    # Extract values metadata.
    missing_mask = column.isna()
    number_missing_values = int(missing_mask.sum())
    if number_missing_values == 0:
        # Nothing to fill: skip the arithmetic below entirely.
        return column

    column_min = column.min()
    column_range = column.max() - column_min

    # Shift data. abs() keeps the shift pointing downwards when the minimum
    # is negative (min - min * p would move *up* in that case).
    column_shift = column_min - abs(column_min) * proportion_below

    # Create the "jitter" (noise) to be added around the points.
    # A local generator yields the same stream as np.random.seed(seed)
    # followed by np.random.rand(...), without reseeding the global NumPy
    # random state as a side effect.
    rng = np.random.RandomState(seed)
    # rand() is in [0, 1), so (rand - 2) is in [-2, -1): the noise is always
    # non-positive and pushes the dummies further below the shift.
    column_jitter = (rng.rand(number_missing_values) - 2) * column_range * jitter

    # Save new dummy data.
    column[missing_mask] = column_shift + column_jitter

    return column
# Scatter plot of weight against height in which rows with a missing value
# are kept: missing entries are replaced by dummy values and the points are
# coloured by whether weight or height was missing.
(
    riskfactors_df
    .select_dtypes(exclude="category")
    # Keep only the columns that contain at least one missing value.
    .pipe(lambda frame: frame[frame.columns[frame.isna().any()]])
    .missing.bind_shadow_matrix(true_string=True, false_string=False)
    # Leave the "_NA" columns untouched; fill every other column.
    .apply(
        lambda series: (
            series
            if "_NA" in series.name
            else column_fill_with_dummies(series, proportion_below=0.05, jitter=0.075)
        )
    )
    .assign(
        nullity=lambda frame: frame.weight_lbs_NA | frame.height_inch_NA
    )
    .pipe(
        lambda frame: sns.scatterplot(
            data=frame,
            x="weight_lbs",
            y="height_inch",
            hue="nullity",
        )
    )
)

missingno.dendrogram(df=riskfactors_df)
riskfactors_df.shape

# Total number of entries versus number of non-missing entries in weight_lbs.
riskfactors_df.weight_lbs.size, riskfactors_df.weight_lbs.count()

# Drop the rows where both weight_lbs and height_inch are missing.
riskfactors_df.dropna(subset=["weight_lbs", "height_inch"], how="all")

# Drop the rows where either weight_lbs or height_inch is missing, then draw
# the nullity matrix of those two columns.
(
    riskfactors_df
    .dropna(subset=["weight_lbs", "height_inch"], how="any")
    .select_columns(["weight_lbs", "height_inch"])
    .pipe(missingno.matrix)
)
# Rebinds implicit_to_explicit_df: every "afternoon" row has a missing name.
implicit_to_explicit_df = pd.DataFrame(
    data={
        "name": ["lynn", np.nan, "zelda", np.nan, "shadowsong", np.nan],
        "time": [
            "morning",
            "afternoon",
            "morning",
            "afternoon",
            "morning",
            "afternoon",
        ],
        "value": [350, 310, 320, 350, 310, 320],
    }
)

implicit_to_explicit_df
# Mean imputation: each missing value is filled with the mean of its own
# column; the "_NA" columns are left untouched.
(
    riskfactors_df
    .select_columns("weight_lbs", "height_inch", "bmi")
    .missing.bind_shadow_matrix(true_string=True, false_string=False)
    .apply(
        axis="rows",
        func=lambda series: (
            series.fillna(series.mean()) if "_NA" not in series.name else series
        ),
    )
)

# Same imputation, followed by the distribution of weight_lbs coloured by
# the "weight_lbs_NA" column.
(
    riskfactors_df
    .select_columns("weight_lbs", "height_inch", "bmi")
    .missing.bind_shadow_matrix(true_string=True, false_string=False)
    .apply(
        axis="rows",
        func=lambda series: (
            series.fillna(series.mean()) if "_NA" not in series.name else series
        ),
    )
    .pipe(
        lambda imputed_df: sns.displot(
            data=imputed_df,
            x="weight_lbs",
            hue="weight_lbs_NA",
        )
    )
)