import empiricaldist
import janitor
import matplotlib.pyplot as plt
import numpy as np
import palmerpenguins
import pandas as pd
import scipy.stats
import seaborn as sns
import sklearn.metrics
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats as ss
%matplotlib inline
sns.set_style(style='whitegrid')
sns.set_context(context='notebook')
plt.rcParams['figure.figsize'] = (11, 9.4)
penguin_color = {
'Adelie': '#ff6602ff',
'Gentoo': '#0f7175ff',
'Chinstrap': '#c65dc9ff'
}
# Source 1: the `palmerpenguins` package (raw table, then its cleaned table).
raw_penguins_df = palmerpenguins.load_penguins_raw()
raw_penguins_df
preprocessed_penguins_df = palmerpenguins.load_penguins()
preprocessed_penguins_df
# Source 2: seaborn's bundled dataset. This rebinds `preprocessed_penguins_df`,
# so the seaborn table is the one every later step works on.
preprocessed_penguins_df = sns.load_dataset("penguins")
# Source 3: local CSV files, kept under separate `...2` names.
raw_penguins_df2 = pd.read_csv('penguins_raw.csv')
raw_penguins_df2
preprocessed_penguins_df2 = pd.read_csv('penguins.csv')
preprocessed_penguins_df2
# dtype of every column.
preprocessed_penguins_df.dtypes

# Number of columns per dtype.
preprocessed_penguins_df.dtypes.value_counts()

# (rows, columns) of the table.
preprocessed_penguins_df.shape

# Per column: is there at least one missing value?
preprocessed_penguins_df.isnull().any()

# Per column: number of missing values, largest first.
preprocessed_penguins_df.isnull().sum().sort_values(ascending=False)

# Total number of missing cells in the whole table.
preprocessed_penguins_df.isnull().sum().sum()
# Share of missing / present cells per column: the boolean mask is melted to
# long form ('variable', 'missing') and drawn as filled horizontal bars.
sns.displot(
    data=preprocessed_penguins_df.isnull().melt(value_name='missing'),
    y='variable',
    hue='missing',
    multiple='fill',
    aspect=2,
)

# Heatmap of the missing-value mask, one row per column of the table.
sns.heatmap(data=preprocessed_penguins_df.isnull().transpose())
# Working table for the rest of the analysis: rows with any missing value removed.
processed_penguins_df = preprocessed_penguins_df.dropna()
processed_penguins_df

# Summary statistics: all columns, numeric columns only, object columns only.
processed_penguins_df.describe(include='all')
processed_penguins_df.describe(include=[np.number])
processed_penguins_df.describe(include=object)

# Same summary after casting the three label columns to `category`.
processed_penguins_df.astype(
    dict(species='category', island='category', sex='category')
).describe(include=['category', object])
# Species counts with the pandas plotting API.
processed_penguins_df.species.value_counts().plot(
    kind='bar',
    # color=penguin_color.values()
)

# Species counts with seaborn's count plot.
sns.catplot(
    data=processed_penguins_df,
    x='species',
    kind='count',
    palette=penguin_color,
    # order=processed_penguins_df.value_counts('species', sort=True).index
)

# Species counts computed explicitly, then drawn as a bar plot.
sns.barplot(
    data=(
        processed_penguins_df
        .value_counts('species', sort=True)
        .reset_index(name='count')
    ),
    x='species',
    y='count',
    palette=penguin_color,
)

# Species proportions as a single filled bar: a constant 'variable' column
# (pyjanitor `add_column`) gives all rows the same x position.
sns.displot(
    data=processed_penguins_df.add_column('variable', ''),
    x='variable',
    hue='species',
    multiple='fill',
    palette=penguin_color,
)
# --- Measures of central tendency -------------------------------------
# The table mixes numeric and string columns, so every frame-wide numeric
# reduction passes `numeric_only=True` explicitly (as the range computation
# below already did); without it recent pandas raises on the string columns.
processed_penguins_df.bill_depth_mm.mean()
np.mean(processed_penguins_df.bill_depth_mm)
processed_penguins_df.mean(numeric_only=True)
processed_penguins_df.median(numeric_only=True)
processed_penguins_df.mode()
processed_penguins_df.describe(include=object)

# --- Measures of dispersion -------------------------------------------
processed_penguins_df.max()
processed_penguins_df.min()

# Range of each numeric column.
processed_penguins_df.max(numeric_only=True) - processed_penguins_df.min(numeric_only=True)

processed_penguins_df.std(numeric_only=True)

# First quartile, then the interquartile range (Q3 - Q1).
processed_penguins_df.quantile(0.25, numeric_only=True)
(
    processed_penguins_df.quantile(0.75, numeric_only=True)
    - processed_penguins_df.quantile(0.25, numeric_only=True)
)

# Quartiles and IQR in one table: one row per numeric column ('metric'),
# one column per quantile plus the derived 'iqr'.
(
    processed_penguins_df
    .quantile(q=[0.75, 0.50, 0.25], numeric_only=True)
    .transpose()
    .rename_axis('metric')
    .reset_index()
    .assign(
        iqr=lambda df: df[0.75] - df[0.25]
    )
)
# Histogram of flipper length with the summary statistics drawn on top.
sns.histplot(data=processed_penguins_df, x='flipper_length_mm')

# (position, colour, line width) for each dashed reference line:
# mean, median, mode, first quartile, third quartile.
for _position, _color, _width in (
    (processed_penguins_df.flipper_length_mm.mean(), 'red', 2),
    (processed_penguins_df.flipper_length_mm.median(), 'blue', 2),
    (processed_penguins_df.flipper_length_mm.mode().values[0], 'black', 4),
    (processed_penguins_df.flipper_length_mm.quantile(0.25), 'yellow', 2),
    (processed_penguins_df.flipper_length_mm.quantile(0.75), 'yellow', 2),
):
    plt.axvline(
        x=_position,
        color=_color,
        linestyle='dashed',
        linewidth=_width,
    )

# Box plot of the same variable.
sns.boxplot(x=processed_penguins_df.flipper_length_mm)
def freedman_diaconis_bindwidth(x: pd.Series) -> float:
    """Return the histogram bin width given by the Freedman-Diaconis rule.

    The width is ``2 * IQR / n ** (1 / 3)``, where ``IQR`` is the
    interquartile range of ``x`` and ``n`` is its number of observations.

    Missing values are dropped first, so ``n`` counts exactly the values
    that the quantiles are computed from.

    Args:
        x: Numeric sample to be binned.

    Returns:
        The bin width as a plain ``float``; ``nan`` when ``x`` holds no
        non-missing values.
    """
    observed = x.dropna()
    n_obs = observed.size
    if n_obs == 0:
        # No data: there is no meaningful width (and n ** (1 / 3) would be 0).
        return float('nan')

    iqr = observed.quantile(0.75) - observed.quantile(0.25)
    return float(2 * iqr / n_obs ** (1 / 3))
# Same histogram with an explicit, hard-coded bin width.
sns.histplot(
    data=processed_penguins_df,
    x='flipper_length_mm',
    binwidth=6.3,
)

# (position, colour, line width) for each dashed reference line:
# mean, median, mode, first quartile, third quartile.
for _position, _color, _width in (
    (processed_penguins_df.flipper_length_mm.mean(), 'red', 2),
    (processed_penguins_df.flipper_length_mm.median(), 'blue', 2),
    (processed_penguins_df.flipper_length_mm.mode().values[0], 'black', 4),
    (processed_penguins_df.flipper_length_mm.quantile(0.25), 'yellow', 2),
    (processed_penguins_df.flipper_length_mm.quantile(0.75), 'yellow', 2),
):
    plt.axvline(
        x=_position,
        color=_color,
        linestyle='dashed',
        linewidth=_width,
    )
# Empirical probabilities with seaborn: 1 mm bins, bar height = probability.
sns.histplot(
    data=processed_penguins_df,
    x='flipper_length_mm',
    binwidth=1,
    stat='probability',
)

# The same distribution as an `empiricaldist` Pmf object.
pmf_flipper_length_mm = empiricaldist.Pmf.from_seq(
    processed_penguins_df.flipper_length_mm,
    normalize=True,
)
pmf_flipper_length_mm.bar()

# Probability the Pmf assigns to 231, next to the observed maximum.
pmf_flipper_length_mm(231)
processed_penguins_df.flipper_length_mm.max()
# Empirical CDF with seaborn.
sns.ecdfplot(data=processed_penguins_df, x="flipper_length_mm")

# The same CDF as an `empiricaldist` Cdf object.
cdf_flipper_length_mm = empiricaldist.Cdf.from_seq(
    processed_penguins_df.flipper_length_mm,
    normalize=True,
)

# Forward lookup: cumulative probability p at a chosen quantity q,
# marked with dashed guide lines and a red dot.
cdf_flipper_length_mm.plot()
q = 200  # Specify quantity
p = cdf_flipper_length_mm.forward(q)
plt.vlines(x=q, ymin=0, ymax=p, color='black', linestyle='dashed')
plt.hlines(
    y=p,
    xmin=pmf_flipper_length_mm.qs[0],
    xmax=q,
    color='black',
    linestyle='dashed',
)
plt.plot(q, p, 'ro')

# Inverse lookup: quantities at the 25% and 75% probabilities (the IQR bounds).
cdf_flipper_length_mm.step()
p_1 = 0.25  # Specify probability
p_2 = 0.75
ps = (p_1, p_2)  # IQR
qs = cdf_flipper_length_mm.inverse(ps)
plt.vlines(x=qs, ymin=0, ymax=ps, color='black', linestyle='dashed')
plt.hlines(
    y=ps,
    xmin=pmf_flipper_length_mm.qs[0],
    xmax=qs,
    color='black',
    linestyle='dashed',
)
plt.scatter(x=qs, y=ps, color='red', zorder=2)

# One ECDF per species.
sns.ecdfplot(
    data=processed_penguins_df,
    x='flipper_length_mm',
    hue='species',
    palette=penguin_color,
)
# Kernel density estimate of flipper length with a narrow bandwidth.
sns.kdeplot(
    data=processed_penguins_df,
    x='flipper_length_mm',
    bw_method=0.1,
)

# Summary statistics of body mass; they parameterise the normal model below.
stats = processed_penguins_df.body_mass_g.describe()
stats

# Normal CDF (dashed) against the empirical CDF of body mass.
np.random.seed(42)
xs = np.linspace(stats['min'], stats['max'])
ys = scipy.stats.norm(loc=stats['mean'], scale=stats['std']).cdf(xs)
plt.plot(xs, ys, color='black', linestyle='dashed')
empiricaldist.Cdf.from_seq(
    processed_penguins_df.body_mass_g,
    normalize=True,
).plot()

# Normal PDF (dashed, grid widened by 1000 g on each side) against the KDE.
xs = np.linspace(stats['min'] - 1000, stats['max'] + 1000)
ys = scipy.stats.norm(loc=stats['mean'], scale=stats['std']).pdf(xs)
plt.plot(xs, ys, color='black', linestyle='dashed')
sns.kdeplot(data=processed_penguins_df, x='body_mass_g')
# A fair six-sided die as a Pmf.
dice = empiricaldist.Pmf.from_seq([1, 2, 3, 4, 5, 6])
dice.bar()

# Draw increasingly large samples from the die and plot each sample's Pmf
# against the theoretical probability 1/6 (dashed red line).
for sample_size in (100, 1_000, 10_000):
    values = dice.sample(sample_size)
    sample_pmf = empiricaldist.Pmf.from_seq(values)

    plt.figure(figsize=(5, 5))
    sample_pmf.bar()
    plt.axhline(y=1/6, color='red', linestyle='dashed')
    plt.ylim([0, 0.50])
    plt.title(f"Sample size: {sample_size}")
# Observed proportion of each sex.
processed_penguins_df.sex.value_counts(normalize=True)

# Sex encoded as 1 (Male) / 0 (Female). An explicit mapping produces the
# numeric column directly and does not depend on `replace` converting an
# object column to integers.
sex_numeric = processed_penguins_df.sex.map({'Male': 1, 'Female': 0})

number_samples = 1000
sample_size = 35

# Draw `number_samples` bootstrap samples (with replacement), one per column.
# The frame is built in a single step: adding 1000 columns one at a time to an
# existing DataFrame fragments it. Draws happen in the same order as a plain
# loop, so the seeded results are unchanged.
np.random.seed(42)
samples_df = pd.DataFrame(
    {
        f"sample_{i}": sex_numeric.sample(sample_size, replace=True).to_numpy()
        for i in range(1, number_samples + 1)
    }
)

# Mean of the sample means = estimated proportion of males.
male_population_mean = samples_df.mean().mean()
print(f"Estimated percentage of male penguins in population is: {male_population_mean * 100:.4f}%")

# Distribution of the sample means, with the observed proportion in red.
sample_means_binomial = pd.DataFrame(samples_df.mean(), columns=['sample_mean'])
sns.kdeplot(data=sample_means_binomial)
plt.axvline(x=sex_numeric.mean(), color='red', linestyle='dashed')

# Estimate obtained from the first k samples, for k = 1..number_samples.
# The mean of the first k column means is the expanding mean of the column
# means, which avoids re-averaging a growing slice of the frame for every k.
sample_size_experiment = pd.DataFrame(
    {
        'sample_size': range(1, number_samples + 1),
        'estimated_mean': (
            sample_means_binomial['sample_mean'].expanding().mean().to_numpy()
        ),
    }
)

sns.scatterplot(
    data=sample_size_experiment,
    x='sample_size',
    y='estimated_mean'
)
plt.axhline(
    y=sex_numeric.mean(),
    color='red',
    linestyle='dashed'
)
plt.ylim([sex_numeric.mean() - 0.20, sex_numeric.mean() + 0.20])
# Bill length vs. bill depth, four views of the same pair of variables.

# 1. Scatter plot with large, half-transparent markers.
sns.scatterplot(
    data=processed_penguins_df,
    x='bill_length_mm',
    y='bill_depth_mm',
    alpha=0.5,
    s=100,
)

# 2. Two-dimensional histogram with marginal rugs.
sns.displot(
    data=processed_penguins_df,
    x='bill_length_mm',
    y='bill_depth_mm',
    rug=True,
)

# 3. Two-dimensional KDE with marginal rugs.
sns.displot(
    data=processed_penguins_df,
    x='bill_length_mm',
    y='bill_depth_mm',
    kind='kde',
    rug=True,
)

# 4. Joint plot with unfilled 25-bin marginal histograms.
sns.jointplot(
    data=processed_penguins_df,
    x='bill_length_mm',
    y='bill_depth_mm',
    marginal_kws={'bins': 25, 'fill': False},
)
# Flipper length by species, drawn with several categorical plot types.

# Plain scatter with species on the x axis.
sns.scatterplot(
    data=processed_penguins_df,
    x='species',
    y='flipper_length_mm',
    hue='species',
    palette=penguin_color,
)

# Strip plot.
sns.stripplot(
    data=processed_penguins_df,
    x='species',
    y='flipper_length_mm',
    palette=penguin_color,
)

# Horizontal box plot (whiskers span the full range) with the raw points
# overlaid in dark grey.
ax = sns.boxplot(
    data=processed_penguins_df,
    x='flipper_length_mm',
    y='species',
    palette=penguin_color,
    whis=np.inf,
)
ax = sns.stripplot(
    data=processed_penguins_df,
    x='flipper_length_mm',
    y='species',
    color='.3',
)

# Light-grey violins with the raw points overlaid in species colours.
ax = sns.violinplot(
    data=processed_penguins_df,
    x='species',
    y='flipper_length_mm',
    color='.8',
)
ax = sns.stripplot(
    data=processed_penguins_df,
    x='species',
    y='flipper_length_mm',
    palette=penguin_color,
)

# Swarm plot.
sns.swarmplot(
    data=processed_penguins_df,
    x='species',
    y='flipper_length_mm',
    hue='species',
    palette=penguin_color,
)
# Correlation matrix of the numeric columns. The table also holds string
# columns (species, island, sex), so the numeric columns are selected
# explicitly before calling `corr()`; this works on every pandas version.
processed_penguins_df.select_dtypes(include=np.number).corr()

# Annotated heatmap of the correlations on a diverging scale centred at 0.
sns.heatmap(
    data=processed_penguins_df.select_dtypes(include=np.number).corr(),
    cmap=sns.diverging_palette(20, 230, as_cmap=True),
    center=0,
    vmin=-1,
    vmax=1,
    linewidths=0.5,
    cbar_kws={"shrink": 0.5},
    annot=True
)

# Same matrix as a cluster map.
sns.clustermap(
    data=processed_penguins_df.select_dtypes(include=np.number).corr(),
    cmap=sns.diverging_palette(20, 230, as_cmap=True),  # 'BrBG'
    center=0,
    vmin=-1,
    vmax=1,
    linewidths=0.5,
    cbar_kws={"shrink": 0.5},
    annot=True
)

# Add sex as a 0/1 column so it can take part in the correlations.
# An explicit mapping produces the integer column directly.
processed_penguins_df = (
    processed_penguins_df
    .assign(
        numeric_sex=lambda df: df.sex.map({'Female': 0, 'Male': 1})
    )
)

# Cluster map again, now including `numeric_sex`.
sns.clustermap(
    data=processed_penguins_df.select_dtypes(include=np.number).corr(),
    cmap=sns.diverging_palette(20, 230, as_cmap=True),  # 'BrBG'
    center=0,
    vmin=-1,
    vmax=1,
    linewidths=0.5,
    cbar_kws={"shrink": 0.5},
    annot=True
)
# Noisy quadratic relationship: scatter plot and its correlation matrix.
x = np.linspace(-100, 100, 100)
y = x ** 2 + np.random.normal(loc=0, scale=1000, size=x.size)
sns.scatterplot(x=x, y=y)
np.corrcoef(x, y)

# Noisy cubic relationship: scatter plot and its correlation matrix.
x = np.linspace(-100, 100, 100)
y = x ** 3 + np.random.normal(loc=0, scale=1000, size=x.size)
sns.scatterplot(x=x, y=y)
np.corrcoef(x, y)

# The real data for comparison: bill length vs. bill depth.
sns.scatterplot(
    data=processed_penguins_df,
    x='bill_length_mm',
    y='bill_depth_mm',
)
# Two synthetic linear data sets: series 1 has a shallow slope and little
# noise, series 2 a steeper slope and much more noise.
np.random.seed(42)

x_1 = np.linspace(0, 100, 100)
y_1 = 0.1 * x_1 + 3 + np.random.uniform(-2, 2, size=x_1.size)

# Every artist gets its own `label`, so the legend entries are tied to the
# artists themselves and not to the order in which they were drawn.
sns.scatterplot(
    x=x_1,
    y=y_1,
    label='1'
)

x_2 = np.linspace(0, 100, 100)
y_2 = 0.5 * x_2 + 1 + np.random.uniform(0, 60, size=x_2.size)

sns.scatterplot(
    x=x_2,
    y=y_2,
    label='2'
)

plt.legend()

# Correlation matrix of each series.
print(np.corrcoef(x_1, y_1))
print(np.corrcoef(x_2, y_2))

# Least-squares line for each series.
res_1 = scipy.stats.linregress(x=x_1, y=y_1)
res_2 = scipy.stats.linregress(x=x_2, y=y_2)

print(res_1, res_2, sep="\n")

# Points of each series together with its fitted line, drawn between the
# smallest and largest x value.
sns.scatterplot(
    x=x_1,
    y=y_1,
    label='1'
)

fx_1 = np.array([x_1.min(), x_1.max()])
fy_1 = res_1.intercept + res_1.slope * fx_1

plt.plot(fx_1, fy_1, label='1')

sns.scatterplot(
    x=x_2,
    y=y_2,
    label='2'
)

fx_2 = np.array([x_2.min(), x_2.max()])
fy_2 = res_2.intercept + res_2.slope * fx_2

plt.plot(fx_2, fy_2, label='2')

plt.legend()
# Bill depth as a linear function of bill length: points plus fitted line.
sns.scatterplot(data=processed_penguins_df, x='bill_length_mm', y='bill_depth_mm')

res_penguins = scipy.stats.linregress(
    x=processed_penguins_df.bill_length_mm,
    y=processed_penguins_df.bill_depth_mm,
)
print(res_penguins)

# The line is drawn between the smallest and largest observed bill length.
fx_1 = np.array(
    [
        processed_penguins_df.bill_length_mm.min(),
        processed_penguins_df.bill_length_mm.max(),
    ]
)
fy_1 = res_penguins.slope * fx_1 + res_penguins.intercept
plt.plot(fx_1, fy_1)

# The same fit drawn by seaborn.
sns.lmplot(
    data=processed_penguins_df,
    x='bill_length_mm',
    y='bill_depth_mm',
    height=10,
)

# Regress y on x and x on y to compare the two fits.
x = processed_penguins_df.bill_length_mm
y = processed_penguins_df.bill_depth_mm

res_x_y = scipy.stats.linregress(x=x, y=y)
res_y_x = scipy.stats.linregress(x=y, y=x)
print(res_x_y, res_y_x, sep="\n")

# Depth explained by length.
sns.scatterplot(x=x, y=y)
fx_1 = np.array([x.min(), x.max()])
fy_1 = res_x_y.slope * fx_1 + res_x_y.intercept
plt.plot(fx_1, fy_1)

# Length explained by depth (axes swapped).
sns.scatterplot(x=y, y=x)
fx_1 = np.array([y.min(), y.max()])
fy_1 = res_y_x.slope * fx_1 + res_y_x.intercept
plt.plot(fx_1, fy_1)
# OLS with the statsmodels formula API.

# Bill length explained by bill depth: fitted coefficients only.
smf.ols(
    formula="bill_length_mm ~ bill_depth_mm",
    data=processed_penguins_df,
).fit().params

# Bill depth explained by bill length: full regression summary.
smf.ols(
    formula="bill_depth_mm ~ bill_length_mm",
    data=processed_penguins_df,
).fit().summary()

# Body-mass models with a growing / alternative set of predictors.
model_1 = smf.ols(
    formula="body_mass_g ~ bill_length_mm",
    data=processed_penguins_df,
).fit()
model_1.summary()

model_2 = smf.ols(
    formula="body_mass_g ~ bill_length_mm + bill_depth_mm ",
    data=processed_penguins_df,
).fit()
model_2.summary()

model_3 = smf.ols(
    formula="body_mass_g ~ bill_length_mm + bill_depth_mm + flipper_length_mm",
    data=processed_penguins_df,
).fit()
model_3.summary()

model_4 = smf.ols(
    formula="body_mass_g ~ bill_length_mm + bill_depth_mm + flipper_length_mm + C(sex)",
    data=processed_penguins_df,
).fit()
model_4.summary()

model_5 = smf.ols(
    formula="body_mass_g ~ flipper_length_mm + C(sex)",
    data=processed_penguins_df,
).fit()
model_5.summary()
# Observed body mass next to the in-sample predictions of the five models,
# plus species and sex for reference.
models_results = pd.DataFrame(
    {
        'actual_value': processed_penguins_df.body_mass_g,
        'prediction_model_1': model_1.predict(),
        'prediction_model_2': model_2.predict(),
        'prediction_model_3': model_3.predict(),
        'prediction_model_4': model_4.predict(),
        'prediction_model_5': model_5.predict(),
        'species': processed_penguins_df.species,
        'sex': processed_penguins_df.sex,
    }
)
models_results

# Compare the distributions of the observed and predicted values.
sns.ecdfplot(
    data=models_results  # .select_columns(['actual_value', 'prediction_model_5'])
)
sns.kdeplot(data=models_results, cumulative=False)

# Body mass vs. flipper length with one regression line per sex.
sns.lmplot(
    data=processed_penguins_df,
    x='flipper_length_mm',
    y='body_mass_g',
    height=10,
    hue='sex',
)
# Logistic regression of sex on three body measurements and island.
# The response is the 0/1 `numeric_sex` column of `processed_penguins_df`,
# so every term in the formula is read from the `data` frame itself and
# none depends on a separate variable with a similar name being in scope.
smf.logit(
    formula='numeric_sex ~ flipper_length_mm + bill_length_mm + bill_depth_mm + C(island)',
    data=processed_penguins_df
).fit().summary()
# Number of penguins per (island, sex, species) combination.
(
    processed_penguins_df
    .value_counts(['island', 'sex', 'species'])
    .reset_index(name='count')
)

processed_penguins_df.species.unique()

# Binary target: 1 for Adelie, 0 for every other species. The comparison
# yields the integer column directly and is computed only once; it is
# reused below as the ground truth.
processed_penguins_df = (
    processed_penguins_df
    .assign(is_adelie=lambda df: (df.species == 'Adelie').astype(int))
)

# Logistic regression: is the penguin an Adelie, given flipper length and sex?
model_is_adelie = smf.logit(
    formula='is_adelie ~ flipper_length_mm + C(sex)',
    data=processed_penguins_df
).fit(maxiter=100)

model_is_adelie.params

# Actual class next to the predicted class (predicted probability rounded
# to 0 or 1).
is_adelie_df_predictions = pd.DataFrame(
    dict(
        actual_adelie=processed_penguins_df.is_adelie,
        predicted_values=model_is_adelie.predict().round()
    )
)

is_adelie_df_predictions

# Confusion matrix built by hand: rows are actual classes, columns predicted.
(
    is_adelie_df_predictions
    .value_counts(['actual_adelie', 'predicted_values'])
    .reset_index(name='count')
    .pivot_wider(
        index='actual_adelie',
        names_from='predicted_values',
        values_from='count'
    )
    .rename_column('actual_adelie', 'actual / predicted')
)

# The same confusion matrix and the accuracy from scikit-learn.
print(
    sklearn.metrics.confusion_matrix(
        is_adelie_df_predictions.actual_adelie,
        is_adelie_df_predictions.predicted_values
    )
)

sklearn.metrics.accuracy_score(
    is_adelie_df_predictions.actual_adelie,
    is_adelie_df_predictions.predicted_values
)
# Bill length vs. bill depth over all penguins: raw points, then points
# with a single regression line.
sns.scatterplot(data=processed_penguins_df, x='bill_length_mm', y='bill_depth_mm')
sns.regplot(data=processed_penguins_df, x='bill_length_mm', y='bill_depth_mm')

# The same relationship with one regression line per species.
sns.lmplot(
    data=processed_penguins_df,
    x='bill_length_mm',
    y='bill_depth_mm',
    hue='species',
    height=10,
    palette=penguin_color,
)

# Pairwise relationships between the columns, coloured by species.
sns.pairplot(
    data=processed_penguins_df,
    hue='species',
    palette=penguin_color,
)