def f(k, l):  # parameter names kept as-is so existing keyword calls keep working
    """Calculate a task parameter (and choose our representative).

    Adds ``k`` to the length of ``l``, multiplies the sum by 47, reduces it
    modulo 11 and shifts the result by one. For an integer ``k`` the result
    is therefore an integer between 1 and 11.

    Args:
        k: Number added to the length of ``l``.
        l: Any sized object (here: a surname string); only its length is used.

    Returns:
        ``(k + len(l)) * 47 % 11 + 1``.
    """
    return (k + len(l)) * 47 % 11 + 1


print(f(30, 'Perina'))
print(f(30, 'Nezdara'))
print(f(3, 'Kondac'))
10
2
6
import numpy as np
import pandas as pd
from plotly import graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
# Print NumPy arrays with five decimals and without scientific notation.
np.set_printoptions(suppress=True, precision=5)

# Load the original dataset: ';' separates the fields and ',' is the decimal
# mark; 'Weight' is read as float and 'Status' as string.
df = pd.read_csv(
    './df.csv',
    sep=';',
    decimal=',',
    index_col=False,
    dtype={'Weight': float, 'Status': str},
)
df.head()
"""
split the original dataset into two subsets based on sparrows status, then drop the 'Status' column
"""
# One independent frame per status value, each without the 'Status' column.
survived = df.loc[df['Status'] == 'survived'].drop(columns=['Status']).copy()
perished = df.loc[df['Status'] == 'perished'].drop(columns=['Status']).copy()

# Summary statistics of the full dataset and of both subsets.
df.describe(include='all')
survived.describe(include='all')
perished.describe(include='all')
def mean_estimate(values) -> float:
    """Estimate the mean of a sample.

    ``values`` has to support ``len()`` and iteration over numbers; the
    result is the sum of the elements divided by their count.
    """
    count = len(values)
    total = sum(values)
    return total / count
# Point estimate of the mean weight in each group.
survived_mean, perished_mean = (
    mean_estimate(group['Weight']) for group in (survived, perished)
)
print(
    f"The mean value estimate for 'Survived' class is {survived_mean:.3f} "
    f"and {perished_mean:.3f} for the 'Perished' class."
)
The mean value estimate for 'Survived' class is 25.463 and 26.275 for the 'Perished' class.
def variance_estimate(values) -> float:
    """Estimate the variance of a sample (unbiased, ``n - 1`` denominator).

    Works for any sized iterable of numbers (list, tuple, pandas Series,
    NumPy array), because the deviations are computed element by element
    instead of relying on array arithmetic.

    Args:
        values: The sample; must contain at least two numbers.

    Returns:
        Sum of squared deviations from ``mean_estimate(values)`` divided by
        ``len(values) - 1``.

    Raises:
        ValueError: If fewer than two values are given, for which the sample
            variance is undefined.
    """
    count = len(values)
    if count < 2:
        raise ValueError('variance_estimate() requires at least two values')
    mean = mean_estimate(values)
    squared_deviations = ((value - mean) ** 2 for value in values)
    # Divide once at the end instead of once per element.
    return sum(squared_deviations) / (count - 1)
# Point estimate of the weight variance in each group.
variance_survived, variance_perished = (
    variance_estimate(group['Weight']) for group in (survived, perished)
)
print(
    f"The variance value estimate for 'Survived' is {variance_survived:.3f} "
    f"and {variance_perished:.3f} for 'Perished' class."
)
The variance value estimate for 'Survived' is 1.585 and 2.168 for 'Perished' class.
def median(values):
    """Calculate the median of a collection of numbers.

    The input is sorted into a new list, so the caller's data is left
    untouched and any iterable (list, tuple, Series, ...) is accepted.

    Args:
        values: Non-empty iterable of numbers.

    Returns:
        The middle element for an odd number of values, otherwise the
        average of the two middle elements.

    Raises:
        ValueError: If ``values`` is empty.
    """
    ordered = sorted(values)
    count = len(ordered)
    if count == 0:
        raise ValueError('median() requires at least one value')
    middle = count // 2
    if count % 2:
        return ordered[middle]
    return (ordered[middle - 1] + ordered[middle]) / 2
# Median weight of each group, computed from plain Python lists.
median_survived, median_perished = (
    median(group['Weight'].tolist()) for group in (survived, perished)
)
print(
    f'Median for survived is {median_survived:.3f} '
    f'and median for perished is {median_perished:.3f}'
)
Median for survived is 25.700 and median for perished is 26.000
# Overlaid histograms of sparrow weight, one colour per status.
fig = px.histogram(
    df,
    x="Weight",
    color="Status",
    title="Histogram of sparrow weight",
    histnorm='probability density',
)
fig.update_layout(
    xaxis_title_text='Weight (grams)',  # x-axis label
    # The bars are normalised to a probability density, so the y axis does
    # not show raw counts.
    yaxis_title_text='Probability density',  # y-axis label
)
fig.show()
def edf(x, values: pd.Series) -> float:  # edf - Empirical Distribution Function
    """Evaluate the empirical distribution function of ``values`` at ``x``.

    Args:
        x: Point at which the function is evaluated.
        values: The observed sample.

    Returns:
        The fraction of entries of ``values`` that are less than or equal to
        ``x``. Missing values never satisfy the comparison but still count
        towards the sample size in the denominator.
    """
    # A direct comparison avoids rebuilding the value counts on every call.
    not_greater = int((values <= x).sum())
    return not_greater / len(values)
# Evaluate each group's empirical distribution function on a 1000-point grid
# spanning that group's observed weight range.
x_s = np.linspace(survived.Weight.min(), survived.Weight.max(), 1000)
y_s = [edf(x, survived.Weight) for x in x_s]
x_p = np.linspace(perished.Weight.min(), perished.Weight.max(), 1000)
y_p = [edf(x, perished.Weight) for x in x_p]

fig = make_subplots(
    rows=2,
    cols=2,
    subplot_titles=[
        f'{y} - {x}'
        for y in ['Histogram', 'Empirical Distribution Function']
        for x in ['survived', 'perished']
    ],
)

# Row 1: density-normalised histograms. add_trace(row=, col=) replaces the
# deprecated append_trace; the trace order is kept so default colours match.
fig.add_trace(go.Histogram(x=survived['Weight'], name='Survived', histnorm='probability density'), row=1, col=1)
fig.add_trace(go.Histogram(x=perished['Weight'], name='Perished', histnorm='probability density'), row=1, col=2)

# Row 2: empirical distribution functions.
fig.add_trace(go.Scatter(x=x_s, y=y_s, name='Survived', mode='lines'), row=2, col=1)
fig.add_trace(go.Scatter(x=x_p, y=y_p, name='Perished', mode='lines'), row=2, col=2)

for col in (1, 2):
    fig.update_xaxes(title_text="Weight (grams)", row=1, col=col)
    # The histograms show a probability density, not raw counts.
    fig.update_yaxes(title_text="Probability density", row=1, col=col)
    fig.update_xaxes(title_text="Weight", row=2, col=col)
    fig.update_yaxes(title_text="P(X <= x)", row=2, col=col)

fig.update_layout(showlegend=False)
fig.show()
from scipy.stats import norm, uniform, expon
# One subplot per group: density histogram overlaid with the fitted
# exponential, uniform and normal probability density functions.
fig = make_subplots(2, 1, shared_xaxes=True)

# Rice Rule: number of bins = 2 * cbrt(n). The value does not depend on the
# group, so it is computed once outside the loop.
# NOTE(review): n is the size of the whole dataset, not of the plotted group;
# confirm that sharing one bin count between both subplots is intended.
nbins = int(np.round(2. * np.cbrt(df.shape[0])))

for i, (cat, values) in enumerate([('Survived', survived['Weight']), ('Perished', perished['Weight'])]):
    row = i + 1
    # Evaluate the fitted densities slightly beyond the observed range.
    x = np.linspace(values.min() - 3, values.max() + 3, 1000)
    loc_exp, scale_exp = expon.fit(values)
    loc_norm, scale_norm = norm.fit(values)
    loc_unif, scale_unif = uniform.fit(values)
    # add_trace(row=, col=) replaces the deprecated append_trace.
    fig.add_trace(go.Histogram(x=values, nbinsx=nbins, name=cat, histnorm='probability density'), row=row, col=1)
    fig.add_trace(go.Scatter(x=x, y=expon.pdf(x, loc_exp, scale_exp), mode='lines', name=f'Exp({1/scale_exp:2.1f})'), row=row, col=1)
    fig.add_trace(go.Scatter(x=x, y=uniform.pdf(x, loc_unif, scale_unif), mode='lines', name=f'Unif({loc_unif:2.2f},{scale_unif:2.2f})'), row=row, col=1)
    fig.add_trace(go.Scatter(x=x, y=norm.pdf(x, loc_norm, scale_norm), mode='lines', name=f'N({loc_norm:2.2f}, {scale_norm**2:2.2f})'), row=row, col=1)

fig.update_layout(title='Histogram of each group with normal, exponential and uniform distribution functions')
fig.update_xaxes(title_text="Weight")
# Histograms and curves are probability densities, not raw counts.
fig.update_yaxes(title_text="Probability density")
fig.show()
def min_of_two(x, y):
    """Return the smaller of ``x.min()`` and ``y.min()``.

    Both arguments must provide a ``min()`` method (e.g. NumPy arrays or
    pandas Series).
    """
    return min(x.min(), y.min())


def max_of_two(x, y):
    """Return the larger of ``x.max()`` and ``y.max()``.

    Both arguments must provide a ``max()`` method (e.g. NumPy arrays or
    pandas Series).
    """
    return max(x.max(), y.max())
def rand_val_dist(name, data, n=100):
    """Generate random values from the fitted normal distribution and plot them.

    A normal distribution is fitted to ``data['Weight']``, ``n`` random values
    are drawn from it, and a figure is shown with the density histogram of
    the data, the density histogram of the generated values and the fitted
    probability density function.

    Args:
        name: Group label used in the legend and in the figure title.
        data: DataFrame with a ``Weight`` column.
        n: Number of random values to generate (default 100).
    """
    loc_s, scale_s = norm.fit(data['Weight'])
    rs = np.random.normal(loc=loc_s, scale=scale_s, size=n)

    # Draw the fitted curve over the joint range of both samples, padded by 1.
    lower = min_of_two(rs, data.Weight) - 1
    upper = max_of_two(rs, data.Weight) + 1
    x = np.linspace(lower, upper, 1000)

    fig = go.Figure()
    fig.add_trace(go.Histogram(x=data.Weight, nbinsx=11, name=name, histnorm='probability density'))
    fig.add_trace(go.Histogram(x=rs, nbinsx=11, name='Generated', histnorm='probability density'))
    fig.add_trace(go.Scatter(x=x, y=norm.pdf(x, loc_s, scale_s), mode='lines', name=f'N({loc_s:2.2f}, {scale_s**2:2.2f})'))
    fig.update_layout(title="Generated samples from " + name + " distribution")
    fig.update_xaxes(title_text="Weight")
    # Histograms and curve are probability densities, not raw counts.
    fig.update_yaxes(title_text="Probability density")
    fig.show()
# Show the generated-sample comparison for both groups, survivors first.
for group_label, group_frame in (('Survived', survived), ('Perished', perished)):
    rand_val_dist(group_label, group_frame)
confidence_level = 95


def alpha(x):
    """Convert a confidence level in percent into a significance level.

    Returns ``1 - x / 100`` rounded to five decimals, which removes
    floating-point noise such as 0.050000000000000044.
    """
    return np.round(1 - (x / 100), 5)


# Two-sided interval: half of the significance level goes into each tail, so
# the value stored in ``a`` (and printed below) is alpha / 2.
a = alpha(confidence_level) / 2
print(f'alpha: {a}')
alpha: 0.025
# Two-sided confidence interval for the mean weight of the perished group.
loc_p, scale_p = norm.fit(perished.Weight)
mu = mean_estimate(perished.Weight)
sigma = np.sqrt(variance_estimate(perished.Weight))
n_sqrt = np.sqrt(perished.Weight.shape[0])
# Upper-tail critical value of the standard normal distribution for ``a``.
Z_ah = norm.isf(a)

# Use of the formula: mu -/+ Z_ah * sigma / sqrt(n).
half_width = Z_ah * (sigma / n_sqrt)
p_lb = mu - half_width
p_ub = mu + half_width
print(f'({p_lb}, {p_ub})')
(25.6859170713605, 26.864082960428632)
# Fitted normal density of the perished group with the confidence interval
# for the mean shaded and the mean estimate marked by a vertical line.
fig = go.Figure()
x = np.linspace(perished.Weight.min() - 1, perished.Weight.max() + 1, 1000)
X = np.linspace(p_lb, p_ub, 1000)

# Shaded area under the density between the interval bounds.
fig.add_trace(go.Scatter(
    x=X,
    y=norm.pdf(X, loc_p, scale_p),
    fill='tozeroy',
    mode='lines',
    name=f'Confidence interval ([{p_lb:2.3f},{p_ub:2.3f}])',
    marker=dict(size=16, color='red', showscale=True),
))
# Fitted density over the whole (padded) data range.
fig.add_trace(go.Scatter(
    x=x,
    y=norm.pdf(x, loc_p, scale_p),
    mode='lines',
    name=f'N({loc_p:2.2f}, {scale_p**2:2.2f})',
    marker=dict(size=16, color='blue', showscale=True),
))
# Vertical line from the axis up to the density at the mean estimate.
fig.add_trace(go.Scatter(
    x=[mu, mu],
    y=[0, norm.pdf(mu, loc_p, scale_p)],
    mode='lines',
    name=f'mu ({mu:2.3f})',
    marker=dict(size=16, color='lime', showscale=True),
))

fig.update_layout(title="Two-tail 95% confidence interval for Perished")
fig.update_xaxes(title_text="Weight")
# The curves are probability densities, not counts.
fig.update_yaxes(title_text="Probability density")
fig.show()
# Two-sided confidence interval for the mean weight of the survived group.
loc_s, scale_s = norm.fit(survived.Weight)
mu = mean_estimate(survived.Weight)
sigma = np.sqrt(variance_estimate(survived.Weight))
n_sqrt = np.sqrt(survived.Weight.shape[0])
# Upper-tail critical value of the standard normal distribution for ``a``.
Z_ah = norm.isf(a)

# Interval bounds: mu -/+ Z_ah * sigma / sqrt(n).
half_width = Z_ah * (sigma / n_sqrt)
s_lb = mu - half_width
s_ub = mu + half_width
print(f'({s_lb}, {s_ub})')
(25.045800384404625, 25.879914162888927)
# Fitted normal density of the survived group with the confidence interval
# for the mean shaded and the mean estimate marked by a vertical line.
fig = go.Figure()
x = np.linspace(survived.Weight.min() - 1, survived.Weight.max() + 1, 1000)
X = np.linspace(s_lb, s_ub, 1000)

# Shaded area under the density between the interval bounds. The legend must
# report the Survived bounds (s_lb, s_ub), not the Perished ones.
fig.add_trace(go.Scatter(
    x=X,
    y=norm.pdf(X, loc_s, scale_s),
    fill='tozeroy',
    mode='lines',
    name=f'Confidence interval ([{s_lb:2.3f},{s_ub:2.3f}])',
    marker=dict(size=16, color='red', showscale=True),
))
# Fitted density over the whole (padded) data range, labelled with the
# parameters that were actually fitted to the survived group.
fig.add_trace(go.Scatter(
    x=x,
    y=norm.pdf(x, loc_s, scale_s),
    mode='lines',
    name=f'N({loc_s:2.2f}, {scale_s**2:2.2f})',
    marker=dict(size=16, color='blue', showscale=True),
))
# Vertical line from the axis up to the density at the mean estimate.
fig.add_trace(go.Scatter(
    x=[mu, mu],
    y=[0, norm.pdf(mu, loc_s, scale_s)],
    mode='lines',
    name=f'mu ({mu:2.3f})',
    marker=dict(size=16, color='lime', showscale=True),
))

fig.update_layout(title="Two-tail 95% confidence interval for Survived")
fig.update_xaxes(title_text="Weight")
# The curves are probability densities, not counts.
fig.update_yaxes(title_text="Probability density")
fig.show()
# Hypothesised mean weight that is checked against both confidence intervals.
K = 30


def in_range(x, l, u):
    """Return whether ``x`` lies in the closed interval [``l``, ``u``]."""
    return l <= x <= u


# The same report is printed for both groups; only the Perished report is
# followed by an extra blank line. After the loop ``accepted`` holds the
# result for the Survived group, as before.
for category, lower, upper, tail in (
    ('Perished', p_lb, p_ub, '\n'),
    ('Survived', s_lb, s_ub, ''),
):
    accepted = in_range(K, lower, upper)
    # Build the wording as whole phrases so no stray double space appears
    # when K is inside the interval.
    membership = 'is' if accepted else 'is not'
    decision = 'accept' if accepted else 'reject'
    print(
        f'Category: {category}\nK:= {K} {membership} in the range of confidence '
        f'interval ({lower:2.3f}, {upper:2.3f}). Thus, we {decision} the hypothesis H_0.{tail}'
    )
Category: Perished
K:= 30 is not in the range of confidence interval (25.686, 26.864). Thus, we reject the hypothesis H_0.
Category: Survived
K:= 30 is not in the range of confidence interval (25.046, 25.880). Thus, we reject the hypothesis H_0.
# Significance level for the two-sample test (rebinds the name ``alpha``).
alpha = 0.05
print(
    f'Variance of the Perished subset: {variance_perished:2.3f} '
    f'and variance of the Survived subset: {variance_survived:2.3f}'
)

# Index 0 is used when the variances differ, index 1 when they are equal.
tests = ["Welch's", "Standard independent two sample"]
# NOTE(review): this is an exact equality of two estimates, not a statistical
# test of equal variances - confirm that this is the intended criterion.
equal = variance_perished == variance_survived
negation = "" if equal else "not"
chosen_test = tests[equal]
print(
    f'Variances are {negation} equal, thus we are going to use {chosen_test} test '
    f'for testing whether or not the two groups share the same mean value.'
)
Variance of the Perished subset: 2.168 and variance of the Survived subset: 1.585
Variances are not equal, thus we are going to use Welch's test for testing whether or not the two groups share the same mean value.
from scipy.stats import ttest_ind
# Two-sample t-test without the equal-variance assumption (Welch's t-test).
stat, p_value = ttest_ind(
    survived.Weight,
    perished.Weight,
    equal_var=False,
)
# H_0 is rejected when the p-value falls below the significance level.
decision = "reject" if p_value < alpha else "accept"
print(f'Based on the Welch\'s t-test, we {decision} the hypothesis H_0.')
Based on the Welch's t-test, we reject the hypothesis H_0.