import numpy as np
import pandas as pd
import scipy.stats as ss
import matplotlib.pyplot as plt
import rdata
np.set_printoptions(precision=3)
#Load dataset
parsed = rdata.parser.parse_file("case0102.rda")
converted = rdata.conversion.convert(parsed)
data = converted['case0102']
#Split dataset
male = data[data['Sex'] == 'Male']
female = data[data['Sex'] == 'Female']
mean_m = male['Salary'].mean()
median_m = male['Salary'].median()
variance_m = male['Salary'].var()
mean_f = female['Salary'].mean()
median_f = female['Salary'].median()
variance_f = female['Salary'].var()
print(f"Muži: stredná hodnota: {mean_m:.3f}, medián: {median_m:.3f}, rozptyl: {variance_m:.3f}")
print(f"Ženy: stredná hodnota: {mean_f:.3f}, medián: {median_f:.3f}, rozptyl: {variance_f:.3f}")
fig, axes = plt.subplots(2)
fig.tight_layout()
axes[0].set_title('Plat muži')
axes[1].set_title('Plat ženy')
male['Salary'].hist(ax=axes[0])
female['Salary'].hist(ax=axes[1], color = "orange")
def ecdf(a):
x, counts = np.unique(a, return_counts=True)
cusum = np.cumsum(counts)
return x, cusum / cusum[-1]
x_m, y_m = ecdf(male['Salary'])
x_f, y_f = ecdf(female['Salary'])
x_m = np.insert(x_m, 0, x_m[0])
x_f = np.insert(x_f, 0, x_f[0])
y_m = np.insert(y_m, 0, 0.)
y_f = np.insert(y_f, 0, 0.)
plt.title('Empirická distribučná funckia')
plt.plot(x_m, y_m, drawstyle='steps-post', label = "EDF muži")
plt.plot(x_f, y_f, drawstyle='steps-post', label = "EDF ženy")
plt.legend()
plt.grid(True)
male_hist = plt.hist(male['Salary'], density=True)
#Odhad lambda exponenciálneho rozdelenia pre mužov
lambda_hat = 1 / mean_m
#Odhad mi a sigma normálneho rozdelenia pre mužov
mi_hat_m = mean_m
sigma_hat_m = np.sqrt(variance_m)
#Odhad a a b rovnomerného rozdelenia pre mužov
m1 = mean_m
m2 = (male['Salary'] ** 2).sum() / male['Salary'].size
a_hat = m1 - np.sqrt(3 * (m2 - m1 ** 2))
b_hat = m1 + np.sqrt(3 * (m2 - m1 ** 2))
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
e = ss.expon.pdf(x, scale = 1 / lambda_hat)
n = ss.norm.pdf(x, loc=mi_hat_m, scale=sigma_hat_m)
u = ss.uniform.pdf(x, loc=a_hat, scale=np.abs(b_hat - a_hat))
plt.plot(x, e, label = "Exponencialne rozdelenie")
plt.plot(x, n, label = "Normálne rozdelenie")
plt.plot(x, u, label = "Rovnomerné rozdelenie")
plt.legend()
print(f"Odhad lambda muži {lambda_hat:.8f}")
print(f"Odhad a, b muži: {a_hat:.3f}, {b_hat:.3f}")
print(f"Odhad sigma, mi muži: {sigma_hat_m:.3f}, {mi_hat_m:.3f}")
female_hist = plt.hist(female['Salary'], density=True)
#Odhad lambda exponenciálneho rozdelenia pre mužov
lambda_hat= 1 / mean_f
#Odhad mi a sigma normálneho rozdelenia pre mužov
mi_hat_f = mean_f
sigma_hat_f = np.sqrt(variance_f)
#Odhad a a b rovnomerného rozdelenia pre mužov
m1 = mean_f
m2 = (female['Salary'] ** 2).sum() / female['Salary'].size
a_hat = m1 - np.sqrt(3 * (m2 - m1 ** 2))
b_hat = m1 + np.sqrt(3 * (m2 - m1 ** 2))
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
e = ss.expon.pdf(x, scale = 1 / lambda_hat)
n = ss.norm.pdf(x, loc=mi_hat_f, scale=sigma_hat_f)
u = ss.uniform.pdf(x, loc=a_hat, scale=np.abs(b_hat - a_hat))
plt.plot(x, e, label = "Exponencialne rozdelenie")
plt.plot(x, n, label = "Normálne rozdelenie")
plt.plot(x, u, label = "Rovnomerné rozdelenie")
plt.legend()
print(f"Odhad lambda ženy {lambda_hat:.8f}")
print(f"Odhad a, b ženy: {a_hat:.3f}, {b_hat:.3f}")
print(f"Odhad sigma, mi žemy: {sigma_hat_f:.3f}, {mi_hat_f:.3f}")
random_val = pd.DataFrame(ss.norm.rvs(loc=mi_hat_m, scale=sigma_hat_m, size=100))
fig, axes = plt.subplots(2)
fig.tight_layout()
male['Salary'].hist(ax=axes[0])
random_val.hist(ax=axes[1])
axes[0].set_title('Plat muži')
axes[1].set_title('Náhodný výber')
random_normal_val = pd.DataFrame(ss.norm.rvs(loc=mi_hat_f, scale=sigma_hat_f, size=100))
fig, axes = plt.subplots(2)
fig.tight_layout()
female['Salary'].hist(ax=axes[0], color = "orange")
random_normal_val.hist(ax=axes[1], color = "orange")
axes[0].set_title('Plat ženy')
axes[1].set_title('Náhodný výber')
def get_confidence_interval(alpha, mean, variance, n):
critical_value = ss.norm.isf(alpha/2)
delta = critical_value * np.sqrt(variance) / np.sqrt(n)
ci = np.array([mean - delta, mean + delta])
print('Obojstranný interval spolahlivosti: ', ci)
print(f'Reálna stredná hodnota: {mean:.3f}')
print('Muži:')
get_confidence_interval(0.05, mean_m, variance_m, male.size)
print('Ženy:')
get_confidence_interval(0.05, mean_f, variance_f, female.size)
s, p = ss.ttest_ind(male['Salary'], female['Salary'], equal_var=False)
print(f"p-hodnota {p:.10f} je nižšia ako α = 5%, takže H_0 na hladine významnosti 5% zamietame.")