import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
from scipy.stats import t
# Load the first data set; the code below reads its "x" column.
df1 = pd.read_csv("data1.csv")
def mean(df):
    """Return the arithmetic mean of *df*.

    The value is ``np.sum(df) / len(df)``, so *df* must be a sized,
    summable collection (list, ndarray, pandas Series, ...).
    """
    return np.sum(df)/len(df)
def std(df):
    """Return the sample standard deviation of *df*.

    The squared deviations from ``mean(df)`` are summed and divided by
    ``len(df) - 1`` before taking the square root.  *df* must support
    element-wise subtraction of a scalar (ndarray, pandas Series, ...).
    """
    # Degrees of freedom of the variance estimate.
    dof = len(df) - 1
    centre = mean(df)
    ssq = np.sum(np.power(df - centre, 2))
    return np.sqrt(ssq/dof)
# Two-sided 95% confidence interval for the mean of column "x", built from the
# 0.975 quantile of Student's t distribution with N - 1 degrees of freedom.
x_bar = mean(df1["x"])
s = std(df1["x"])
N = len(df1["x"])
DF = N - 1
t0975 = stats.t.ppf(0.975, DF, loc=0, scale=1)
# Half-width of the interval: quantile times the standard error of the mean.
half_width = t0975*s/np.sqrt(N)
CI = [x_bar - half_width, x_bar + half_width]
print(CI)
# Output: [3.336200510068722, 3.564899310086135]
# Point estimates for column "x": the sample mean, and the sample variance
# obtained by squaring the result of the std() helper.
sample_sd = std(df1["x"])
mu = mean(df1["x"])
var = np.power(sample_sd, 2)
print("The expected value: ", mu)
print()
print("The variance: ", var)
# Output:
# The expected value: 3.4505499100774286
# The variance: 2.7012283035501863
# Density at x = 3 of the normal distribution whose mean and standard
# deviation are the sample estimates from column "x".
m = mean(df1["x"])
s = std(df1["x"])
z = (3 - m)/s
normalisation = s*np.sqrt(2*np.pi)
density = np.exp(-np.power(z, 2)/2)/normalisation
print(density)
# Output: 0.23378194768352722
# Gaussian log-likelihood of column "x", evaluated at the sample mean and the
# sample standard deviation returned by the mean()/std() helpers.
N = len(df1["x"])
m = mean(df1["x"])
s = std(df1["x"])
# Bound to a name and printed: as a bare expression the value was discarded
# whenever the code ran outside an interactive session.
log_likelihood = N*np.log(1/(s*np.sqrt(2*np.pi))) - np.sum(np.power(df1["x"]-m,2))/(2*np.power(s,2))
print(log_likelihood)
# Histogram of column "x" over equal-width bins spanning [min, max].
n_bins = 5
mini = np.min(df1["x"])
maxi = np.max(df1["x"])
r = (maxi - mini)/n_bins
bins = [r*i + mini for i in range(n_bins + 1)]
count = []
last = len(bins) - 2
for i, (lower, upper) in enumerate(zip(bins[:-1], bins[1:])):
    in_bin = df1["x"] >= lower
    if i < last:
        # Every bin but the last is half-open: [lower, upper).
        in_bin = in_bin & (df1["x"] < upper)
    # The last bin has no upper bound, so the maximum itself is counted.
    count.append(int(in_bin.sum()))
print("Edges of bins: ", bins)
print("How many values in each bin: ", count)
# Output:
# Edges of bins: [0.0180593017749679, 2.7016733440153002, 5.385287386255633, 8.068901428495966, 10.752515470736299, 13.436129512976631]
# How many values in each bin: [256, 449, 89, 1, 1]
df2 = pd.read_csv("data2.csv")
# Relative frequency of each "x2" class among the rows whose split is "train".
train_x2 = df2[df2["split"] == "train"]["x2"]
freq = {}
for label in train_x2:
    freq[label] = freq.get(label, 0) + 1
# Computed once: the total does not change while the shares are derived.
total = sum(freq.values())
relfreq = {label: n/total for label, n in freq.items()}
for label, share in relfreq.items():
    print("class and relative frequency: ", label, share)
# Output:
# class and relative frequency: c 0.25837676842889057
# class and relative frequency: b 0.4043186895011169
# class and relative frequency: a 0.33730454206999255
# Two-sided F-test for equal variances of "x1" between the y == 0 and y == 1 rows.
x1_y0 = df2[df2["y"]==0]["x1"]
x1_y1 = df2[df2["y"]==1]["x1"]
# Despite their names, ddof0/ddof1 hold the degrees of freedom (n - 1) of each group.
ddof0 = len(x1_y0)-1
ddof1 = len(x1_y1)-1
# Density of the F(ddof1, ddof0) reference distribution between its 0.1% and 99.9% quantiles.
# NOTE(review): no plt.show() follows here, so in a plain script this curve stays
# on the current axes until the next plt.show() call.
x = np.linspace(stats.f.ppf(0.001, ddof1, ddof0), stats.f.ppf(0.999, ddof1, ddof0), 100)
plt.plot(x, stats.f.pdf(x, ddof1, ddof0), 'r-', lw=5, alpha=0.6, label='f pdf')
var_0 = np.var(x1_y0, ddof=1)
var_1 = np.var(x1_y1, ddof=1)
print("Var y = 0: ", var_0)
print("Var y = 1: ", var_1)
f = var_1/var_0
# Two-sided p-value: double the smaller tail. Doubling only the upper tail
# yields a value above 1 whenever f lies below the median of the distribution.
cdf_f = stats.f.cdf(f, ddof1, ddof0)
p = 2*min(cdf_f, 1-cdf_f)
print("F statistic: ", f)
print("P-value: ", p)
# Output:
# Var y = 0: 0.019682844048276728
# Var y = 1: 0.020881642051954318
# F statistic: 1.0609057309369145
# P-value: 0.505932297463624
# 5% quantile of the F(ddof1, ddof0) distribution. The value is neither
# assigned nor printed, so it is only displayed in an interactive session.
# NOTE(review): the p-value above is two-sided; confirm that a one-sided
# 5% quantile is the intended critical value.
stats.f.ppf(0.05, ddof1, ddof0)
# Fit a normal distribution to "x1" (all rows of df2) via the sample mean and
# sample variance, then plot its density over the observed range of the data.
x1_all = df2["x1"]
mu = mean(x1_all)
var = np.power(std(x1_all), 2)
print("The expected value: ", mu)
print()
print("The variance: ", var)
mini = np.min(x1_all)
maxi = np.max(x1_all)
x = np.linspace(mini, maxi, 1000)
sigma = np.sqrt(var)
y = stats.norm.pdf(x, mu, sigma)
plt.plot(x, y)
plt.show()
# Output:
# The expected value: 0.5577487640411114
# The variance: 0.4332258709089382
# Probability that a Normal(mu, var) variable falls between 0.2 and 0.4,
# obtained as the difference of the CDF at the two end points.
sigma = np.sqrt(var)
cdf_upper = stats.norm.cdf(0.4, mu, sigma)
cdf_lower = stats.norm.cdf(0.2, mu, sigma)
p = cdf_upper - cdf_lower
print(p)
# Output: 0.11191051824624704
# Rows of df2 that belong to the training split. Stored as train_df because the
# train() function defined directly below rebinds the name "train", which
# would otherwise discard this frame immediately.
train_df = df2[df2["split"]=="train"]
def train(df):
    """Unfinished training routine: prepares per-class statistics for *df*.

    Computes the mean of every numeric column for each value of ``y`` and
    creates three empty dictionaries.  Nothing is returned yet.

    TODO: fill ``prior``, ``llh_d`` and ``llh_c`` and return the fitted model
    (presumably class priors and likelihood tables -- confirm the intent).
    """
    # numeric_only=True: with pandas >= 2.0 a plain .mean() raises TypeError
    # when the frame also holds string columns.
    class_means = df.groupby("y").mean(numeric_only=True).reset_index()
    prior = {}
    llh_d = {}
    llh_c = {}
# Load the third data set; the tests below read its "group", "active_time"
# and "subject_id" columns.
df3 = pd.read_csv("data3.csv")
# Bare expression: shows the frame in an interactive session and has no
# effect when the file runs as a plain script.
df3
# Output (DataFrame preview as rendered by the notebook; rows 70-79):
#   column       dtype    range
#   active_time  float64  7.7300214972121 - 72.0173197480512
#   group        int64    1 - 4
#
#   index  active_time         group
#   70     39.35305367816489   1
#   71     46.083600636655966  1
#   72     53.794087066072834  1
#   73     56.74547014923131   1
#   74     34.11157560311092   1
#   75     50.19574708564287   1
#   76     45.08254884668923   1
#   77     50.68200424211341   1
#   78     20.421565307135214  1
#   79     48.96003639946972   1
# TEST 1
# Welch's unequal-variance t-test comparing the mean "active_time" of
# group 1 against group 2 (two-sided).
time_g1 = df3[df3["group"] == 1]["active_time"]
time_g2 = df3[df3["group"] == 2]["active_time"]
x_bar = np.mean(time_g1)
y_bar = np.mean(time_g2)
N_x = len(time_g1)
N_y = len(time_g2)
# ddof=1: the test statistic needs the unbiased sample variance; np.var
# defaults to ddof=0 (population variance), which understates the spread.
s2_x = np.var(time_g1, ddof=1)
s2_y = np.var(time_g2, ddof=1)
# Squared standard error contributed by each group.
se2_x = s2_x/N_x
se2_y = s2_y/N_y
# Welch-Satterthwaite approximation of the degrees of freedom.
dff = np.power(se2_x + se2_y, 2)/(np.power(se2_x, 2)/(N_x-1) + np.power(se2_y, 2)/(N_y-1))
t0 = (x_bar-y_bar)/np.sqrt(se2_x + se2_y)
# Two-sided p-value: double the smaller tail probability.
cdf_t0 = stats.t.cdf(t0, dff)
p = 2*min(cdf_t0, 1-cdf_t0)
print("P-value of test 1: ", p)
#PDF
# Density of the reference t distribution between its 1% and 99% quantiles.
x = np.linspace(stats.t.ppf(0.01, dff), stats.t.ppf(0.99, dff), 100)
plt.plot(x, stats.t.pdf(x, dff), 'r-', lw=5, alpha=0.6, label='t pdf')
# Output: P-value of test 1: 0.01150029346070903
# TEST 2
# NOT ENOUGH TIME TO COMPUTE
# NOTE(review): this section is unfinished. It prepares a subset for groups
# 1 and 3 but then repeats the group 1 vs group 2 quantities of TEST 1 and
# never derives a new test statistic.
# Number of rows per subject_id.
v = df3.subject_id.value_counts()
# Keep only the rows of subjects that occur more than once.
dftest2 = df3[df3.subject_id.isin(v.index[v.gt(1)])].reset_index()
# Group 1 and group 3 rows of that subset; neither frame is used below.
dfg1 = dftest2[dftest2["group"] == 1]
dfg3 = dftest2[dftest2["group"] == 3]
# Mean taken over the whole frame; m is not used below.
m = np.mean(df3)
# Same group 1 / group 2 expressions as in TEST 1, computed from all of df3.
x_bar = np.mean(df3[df3["group"] == 1]["active_time"])
y_bar = np.mean(df3[df3["group"] == 2]["active_time"])
N_x = len(df3[df3["group"] == 1]["active_time"])
N_y = len(df3[df3["group"] == 2]["active_time"])
s2_x = np.var(df3[df3["group"] == 1]["active_time"])
s2_y = np.var(df3[df3["group"] == 2]["active_time"])
# Bare expression: t0 is not reassigned in this section, so this only displays
# the statistic left over from TEST 1 in an interactive session.
t0
# t0 and dff still hold the TEST 1 values, so p and the printed label
# ("test 1") repeat the TEST 1 result rather than a second test.
p = 2*min(stats.t.cdf(t0, dff), 1-t.cdf(t0, dff))
print("P-value of test 1: ", p)
#PDF
# Density of the t distribution with dff degrees of freedom between its
# 1% and 99% quantiles.
x = np.linspace(t.ppf(0.01, dff),
t.ppf(0.99, dff), 100)
plt.plot(x, t.pdf(x, dff),
'r-', lw=5, alpha=0.6, label='t pdf')
# Output: P-value of test 1: 0.01150029346070903