import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import rdata
from statsmodels.distributions.empirical_distribution import ECDF
from scipy.optimize import minimize
from scipy.stats import norm, t, chi2, expon, uniform
np.set_printoptions(precision=3)
np.random.seed(13232)
# nacteni souboru
path = './ex0211.rda'
parsed = rdata.parser.parse_file(path)
converted = rdata.conversion.convert(parsed)
data = converted['ex0211']
data
# rozdeeni do dvou skupin
ctl = data[data['Group'] == 'control'].drop(columns='Group')
bcl = data[data['Group'] == 'bacilli'].drop(columns='Group')
ctl_mean, ctl_var, ctl_median = ctl['Lifetime'].mean() , ctl['Lifetime'].var(), ctl['Lifetime'].median()
bcl_mean, bcl_var, bcl_median = bcl['Lifetime'].mean() , bcl['Lifetime'].var(), bcl['Lifetime'].median()
print(f'Control group:\nMean: {ctl_mean:.2f}\nVariance: {ctl_var:.2f}\nMedian: {ctl_median:.2f}')
print('=============')
print(f'Bacilli group:\nMean: {bcl_mean:.2f}\nVariance: {bcl_var:.2f}\nMedian: {bcl_median:.2f}')
Control group:
Mean: 345.23
Variance: 49371.67
Median: 316.50
=============
Bacilli group:
Mean: 242.53
Variance: 13907.69
Median: 214.50
def plot_distr(arr, name):
plt.figure(figsize=(16, 8))
plt.subplot(121)
plt.title(f'Hustota pravdepodobnosti skupina {name}')
sns.distplot(arr, hist=True, kde=True, bins=15);
plt.subplot(122)
plt.title(f'Distribucni funkce skupina {name}')
ecdf = ECDF(arr)
plt.plot(ecdf.x, ecdf.y)
plt.xlabel('Lifetime')
plt.ylabel('Pravdepodobnost')
plt.grid()
plt.show()
# Skupina "BAcili"
plot_distr(bcl.values.reshape(1, -1)[0], 'Bacilli')
/shared-libs/python3.7/py/lib/python3.7/site-packages/seaborn/distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
warnings.warn(msg, FutureWarning)
plot_distr(ctl.values.reshape(1, -1)[0], 'Control')
/shared-libs/python3.7/py/lib/python3.7/site-packages/seaborn/distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
warnings.warn(msg, FutureWarning)
def get_params(loc, scale, distr):
if distr == 'exp':
return 1 / scale
elif distr == 'unif':
a = loc
b = loc + scale
return a, b
elif distr == 'norm':
return loc, scale
def fit_distributions(val, name):
# exp distribution Exp(lambda)
exp_loc, exp_scale = expon.fit(val, floc=0)
lmbd = get_params(exp_loc, exp_scale, 'exp')
print(f'MLE odhad {name} ~ Exp(lambda={lmbd:.4f})')
# normal distribution N(m, s)
norm_loc, norm_scale = norm.fit(val)
print(f'MLE odhad {name} ~ N(mean={norm_loc:.4f}, std={norm_scale})')
# uniform distribution
unif_loc, unif_scale = uniform.fit(val)
a, b = get_params(unif_loc, unif_scale, 'unif')
print(f'MLE odhad {name} ~ Unif({a}, {b})')
# get best distribution
distrs = ['norm', 'exp', 'unif']
dic = {
'norm': {
'function': norm,
'loc': norm_loc,
'scale': norm_scale,
},
'exp': {
'function': expon,
'loc': exp_loc,
'scale': exp_scale,
},
'unif': {
'function': uniform,
'loc': unif_loc,
'scale': unif_scale,
}
}
res = []
for d in distrs:
densities = dic[d]['function'].pdf(val, loc=dic[d]['loc'], scale=dic[d]['scale'])
res.append(np.prod(densities))
best_d = distrs[np.argmax(res)]
print(f'Nejlepsi odhad - {best_d}{get_params(dic[best_d]["loc"], dic[best_d]["scale"], best_d)} rozdeleni')
plt.figure(figsize=(12, 8))
x = np.linspace(min(val), max(val), 1000)
plt.hist(val, bins=20, density=True)
plt.plot(x, expon.pdf(x, loc=exp_loc, scale=exp_scale), label='Exponential distr', linewidth=3)
plt.plot(x, norm.pdf(x, loc=norm_loc, scale=norm_scale), label='Normal distr', linewidth=3)
plt.plot(x, uniform.pdf(x, loc=unif_loc, scale=unif_scale), label='Uniform distr', linewidth=3)
plt.legend()
plt.title(f'{name} group')
plt.show()
fit_distributions(bcl.values.reshape(1, -1)[0], 'bacilli')
MLE odhad bacilli ~ Exp(lambda=0.0041)
MLE odhad bacilli ~ N(mean=242.5345, std=116.90981132203213)
MLE odhad bacilli ~ Unif(76.0, 598.0)
Nejlepsi odhad - norm(242.5344827586207, 116.90981132203213) rozdeleni
fit_distributions(ctl.values.reshape(1, -1)[0], 'control')
MLE odhad control ~ Exp(lambda=0.0029)
MLE odhad control ~ N(mean=345.2344, std=220.4546255884856)
MLE odhad control ~ Unif(18.0, 735.0)
Nejlepsi odhad - unif(18.0, 735.0) rozdeleni
# bacilli
samples = norm.rvs(loc=242.5345, scale=116.9098, size=100, random_state=42)
plt.figure(figsize=(10, 6))
plt.hist(samples, alpha=0.7, label='generated', density=True)
plt.hist(bcl.values, alpha=0.7, label='real', density=True)
plt.legend()
plt.show()
# control
samples = uniform.rvs(loc=18, scale=717, size=100, random_state=42)
plt.figure(figsize=(10, 6))
plt.hist(samples, alpha=0.7, label='generated', density=True)
plt.hist(ctl.values, alpha=0.7, label='real', density=True)
plt.legend()
# Method, implements sequence of actions to calculate 2 sided CI with distribution t
def calculate_full_ci(data, target, group_name, alpha, logs=True):
if target != None:
X = data[target]
else:
X = data
n = len(data)
Xn = sum(X) / n
s2 = sum((X - Xn) ** 2) / (n - 1)
s = np.sqrt(s2)
krit_hodnota = t.isf(alpha/2, df=n-1)
delta = krit_hodnota * s / np.sqrt(n)
IS = np.array([Xn - delta, Xn + delta])
if logs == True:
print('\nSkupina {0}:'.format(group_name))
print('{0}% kritická hodnota rozdělení t:\nt({1})={2:.4f}'
.format(100*alpha/2, alpha/2, krit_hodnota))
print('Oboustranný IS: ', IS)
return IS
# Skupina "Control"
IS_c = calculate_full_ci(ctl, 'Lifetime', 'Control', 0.05, logs=True)
# Skupina "Bacilli"
IS_b = calculate_full_ci(bcl, 'Lifetime', 'Bacilli', 0.05, logs=True)
Skupina Control:
2.5% kritická hodnota rozdělení t:
t(0.025)=1.9983
Oboustranný IS: [289.731 400.738]
Skupina Bacilli:
2.5% kritická hodnota rozdělení t:
t(0.025)=2.0025
Oboustranný IS: [211.526 273.543]
def check_hypotesis(k, IS):
res = k < IS[0] or k > IS[1]
if res == True:
print('Hypotezu odmitame.')
else:
print('Hypotezu neodmitame.')
alpha = 0.05
K = 20
print(f'Hypoteza: stredni hodnota = {K}')
# Skupina "Control"
print('\nSkupina "Control":')
res_ctl = check_hypotesis(K ,IS_c)
print('\nSkupina "Bacilli":')
res_ctl = check_hypotesis(K ,IS_b)
Hypoteza: stredni hodnota = 20
Skupina "Control":
Hypotezu odmitame.
Skupina "Bacilli":
Hypotezu odmitame.
# code
X_ctl = ctl['Lifetime'].values
X_bcl = bcl['Lifetime'].values
X_ctl = np.random.choice(X_ctl, size=len(X_bcl))
n = len(X_ctl)
ctl_m = X_ctl.mean()
ctl_v = X_ctl.var()
bcl_m = X_bcl.mean()
Z_mean = bcl_m - ctl_m
Z_var = bcl_var + ctl_v - 2*np.cov(X_ctl, X_bcl)
print(f'Prumer rodilu: {Z_mean}')
print(f'Kovariancni matice rozdilu: \n{Z_var}')
Prumer rodilu: -96.51724137931035
Kovariancni matice rozdilu:
[[-32608.666 73590.894]
[ 73590.894 31031.84 ]]
#H_0: mean_Z = 0
#H_a: mean != 0
Z_variance = 73464.422
alpha = 0.05
IS_z = calculate_full_ci(Z, None, 'Prumer rozdilu', alpha, logs=True)
print('Hypotezu H_0 odmitame.')
Skupina Prumer rozdilu:
2.5% kritická hodnota rozdělení t:
t(0.025)=2.0025
Oboustranný IS: [ 26.074 169.443]
Hypotezu H_0 odmitame.