import pandas as pd

df1 = pd.read_csv('data1.csv')
df1.head()
x = df1['x']
z = df1['z']
from scipy.stats import chi2
import matplotlib.pyplot as plt
import numpy as np

# MLE fit of a chi-squared distribution to x (returns shape a, loc, scale)
a, loc, scale = chi2.fit(x)
print('Parameters a:', a, 'loc:', loc, 'scale:', scale)
x_gen = np.linspace(-4, 10, 1000)
plt.plot(x_gen, chi2.pdf(x_gen, a, loc, scale))
plt.title(f'PDF of chi2({a:.2f}, {loc:.2f}, {scale:.2f})')
# P(0.8 < X <= 2.5) under the fitted distribution
p0 = chi2.cdf(2.5, a, loc, scale) - chi2.cdf(0.8, a, loc, scale)
print(p0)
# P(X > 1.2)
p1 = 1 - chi2.cdf(1.2, a, loc, scale)
print(p1)
# the product is the joint probability, assuming two independent draws
print(p0*p1)
import math

# log-likelihood of the chi-squared fit, summed over the sample
llf_data = []
for i in x:
    lf = chi2.pdf(i, a, loc, scale)
    llf_data.append(math.log(lf))
print(sum(llf_data))
from scipy.stats import norm

# for comparison, fit a normal and compute its log-likelihood
sm, sstd = norm.fit(x)
llf_data2 = []
for i in x:
    lf = norm.pdf(i, sm, sstd)
    llf_data2.append(math.log(lf))
print(sum(llf_data2))
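# Sanity check: scipy's logpdf should reproduce the two manual log-likelihood
# sums above (assuming no zero densities in the sample).
print(np.sum(chi2.logpdf(x, a, loc, scale)), np.sum(norm.logpdf(x, sm, sstd)))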
import statistics
import random

# bootstrap confidence interval for the mode of z
N = len(z)
mode_hat = statistics.mode(z)
m = []
e = []
for i in range(100):
    rs = random.choices(list(z), k=N)
    m.append(statistics.mode(rs))
    e.append(m[-1] - mode_hat)
q0 = np.quantile(e, 0.05)
q1 = np.quantile(e, 0.95)
print('90% CI:[', mode_hat - q1, ',', mode_hat - q0, ']')
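# Cross-check, assuming a plain percentile bootstrap is acceptable here:
# take the 5% and 95% quantiles of the bootstrap modes directly.
print('90% percentile CI:[', np.quantile(m, 0.05), ',', np.quantile(m, 0.95), ']')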
# manual histogram of x with 5 equal-width bins
min_x = min(x)
max_x = max(x)
range_x = max_x - min_x
print(max_x)
step = range_x/5
bins = [min_x + step, min_x + 2*step, min_x + 3*step, min_x + 4*step, min_x + 5*step]
print(bins)
x_sorted = x.sort_values()
f1 = f2 = f3 = f4 = f5 = 0
for i in x_sorted:
    if i <= bins[0]:
        f1 += 1
    elif i <= bins[1]:
        f2 += 1
    elif i <= bins[2]:
        f3 += 1
    elif i <= bins[3]:
        f4 += 1
    else:
        f5 += 1
f_x = [f1, f2, f3, f4, f5]
plt.bar(bins, f_x)
plt.show()
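# The built-in histogram with 5 equal-width bins should reproduce these counts:
counts, edges, _ = plt.hist(x, bins=5)
print(counts)
plt.show()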
# same binning for z
min_z = min(z)
max_z = max(z)
range_z = max_z - min_z
step = range_z/5
binsz = [min_z + step, min_z + 2*step, min_z + 3*step, min_z + 4*step, min_z + 5*step]
print(binsz)
z_sorted = z.sort_values()
y1 = y2 = y3 = y4 = y5 = 0
for i in z_sorted:
    if i <= binsz[0]:
        y1 += 1
    elif i <= binsz[1]:
        y2 += 1
    elif i <= binsz[2]:
        y3 += 1
    elif i <= binsz[3]:
        y4 += 1
    else:
        y5 += 1
y_z = [y1, y2, y3, y4, y5]
plt.bar(binsz, y_z)
plt.show()
train = pd.read_csv('data2_train.csv')
train.head()
v1 = train['var1']
v2 = train['var2']
y = train['target']
from scipy.stats import beta

# estimate parameters of the class-conditional likelihoods for var2
y0 = train[train['target']==0]
v2_c0 = y0['var2']
y1 = train[train['target']==1]
v2_c1 = y1['var2']
pars_c0 = beta.fit(v2_c0)
pars_c1 = beta.fit(v2_c1)
# estimate priors
p_c0 = len(v2_c0)/len(v2)
p_c1 = len(v2_c1)/len(v2)
print(p_c0 + p_c1)  # sanity check: the priors should sum to 1
# unnormalized class scores at var2 = 0.27
c0_score = p_c0*beta.pdf(0.27, *pars_c0)
c1_score = p_c1*beta.pdf(0.27, *pars_c1)
print(c0_score, c1_score)
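# Normalizing the two scores gives posterior class probabilities at var2 = 0.27:
total = c0_score + c1_score
print('P(C0|0.27) =', c0_score/total, 'P(C1|0.27) =', c1_score/total)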
from scipy.stats import binom

# binomial mixture fitted with EM on the class-0 var1 counts
X = y0['var1']
plt.hist(X)
# integer support for the pmf plots (binom.pmf is zero off the integers)
bins = np.arange(np.min(X), np.max(X) + 1)
X = np.array(X)
print("Dataset shape:", X.shape)

def pdf(data, samplesize, prob):
    # binomial pmf used as the mixture component "pdf"
    return binom.pmf(data, samplesize, prob)

# define the number of clusters to be learned
k = 3
weights = np.ones(k) / k
samplesizes = np.random.choice(X, k)
probs = np.random.random_sample(size=k)
print(samplesizes, probs)
eps = 1e-8
for step in range(25):
    # plot the current component fits at each iteration
    plt.figure(figsize=(10, 6))
    plt.xlabel("$x$")
    plt.ylabel("pdf")
    plt.title("Iteration {}".format(step))
    plt.scatter(X, [0.005] * len(X), color='navy', s=30, marker=2, label="Train data")
    plt.plot(bins, pdf(bins, samplesizes[0], probs[0]), color='blue', label="Cluster 1")
    plt.plot(bins, pdf(bins, samplesizes[1], probs[1]), color='green', label="Cluster 2")
    plt.plot(bins, pdf(bins, samplesizes[2], probs[2]), color='magenta', label="Cluster 3")
    plt.legend(loc='upper left')
    plt.savefig("img_{0:02d}".format(step), bbox_inches='tight')
    plt.show()
    # Expectation step: likelihood of each observation under each component
    likelihood = []
    for j in range(k):
        likelihood.append(pdf(X, samplesizes[j], probs[j]))
    likelihood = np.array(likelihood)
    b = []
    # Maximization step
    for j in range(k):
        # posterior probability (responsibility) of each observation having
        # been generated by component j, given the current parameters
        b.append((likelihood[j] * weights[j]) / (np.sum([likelihood[i] * weights[i] for i in range(k)], axis=0) + eps))
        # update the success probability: responsibility-weighted mean count
        # divided by the number of trials; the sample sizes drawn at
        # initialization are kept fixed, since an MLE update for n is unstable
        probs[j] = np.clip(np.sum(b[j] * X) / ((np.sum(b[j]) + eps) * samplesizes[j]), eps, 1 - eps)
        # update the mixture weights
        weights[j] = np.mean(b[j])
# after running the algorithm the final parameters are
print('Sample sizes:', samplesizes, 'Probs:', probs)
print('Pis:', weights)
# plot the fitted mixture pmf over the integer support of the data
test = np.arange(0, np.max(X) + 1)
pdf1 = binom.pmf(test, samplesizes[0], probs[0])
pdf2 = binom.pmf(test, samplesizes[1], probs[1])
pdf3 = binom.pmf(test, samplesizes[2], probs[2])
mix_pdf = weights[0]*pdf1 + weights[1]*pdf2 + weights[2]*pdf3
plt.plot(test, mix_pdf)
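# Convergence check (a sketch): the mixture log-likelihood should be
# non-decreasing across EM iterations if the updates above are correct.
mix = np.sum([weights[j]*pdf(X, samplesizes[j], probs[j]) for j in range(k)], axis=0)
print('final log-likelihood:', np.sum(np.log(mix + eps)))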
X_0 = train[train['target']==0]
x01 = X_0['var1']
x02 = X_0['var2']
X_1 = train[train['target']==1]
x11 = X_1['var1']
x12 = X_1['var2']
# parameters for the class-conditional likelihoods:
# var1 is a count, modeled as binomial per class; since n is not given, (n, p)
# is estimated by the method of moments (p = 1 - s^2/m, n = m/p), which
# assumes the counts are underdispersed (s^2 < m)
def binom_mom(counts):
    m, s2 = counts.mean(), counts.var()
    p = 1 - s2/m
    return round(m/p), p

d01 = binom_mom(x01)  # class 0, var1
d11 = binom_mom(x11)  # class 1, var1
# var2 lies in (0, 1) and is modeled with a beta distribution per class
d02 = beta.fit(x02)   # class 0, var2
d12 = beta.fit(x12)   # class 1, var2
print(d01, d02)
print(d11, d12)
# priors
p0 = len(X_0)/len(train)
p1 = len(X_1)/len(train)
y_test = pd.read_csv('data2_test_label.csv')
X_test = pd.read_csv('data2_test.csv')
y_test.head()
X_test.head()
# classify each test point by the larger of the two class scores,
# matching each class with its own likelihood parameters
y_pred = []
for i, j in zip(X_test['var1'], X_test['var2']):
    s0 = p0 * binom.pmf(i, d01[0], d01[1]) * beta.pdf(j, *d02)
    s1 = p1 * binom.pmf(i, d11[0], d11[1]) * beta.pdf(j, *d12)
    y_pred.append(0 if s0 > s1 else 1)
y_pred = np.array(y_pred)
y_test = np.array(y_test).ravel()  # flatten to 1-D (assumes a single label column)
def F1(pred, true, T, N1):
    # count the confusion-matrix cells, with T the positive label
    # and N1 the negative label
    TP = 0
    TN = 0
    FP = 0
    FN = 0
    for i in range(len(pred)):
        if pred[i] == T:
            if true[i] == T:
                TP += 1
            else:
                FP += 1
        if pred[i] == N1:
            if true[i] == N1:
                TN += 1
            else:
                FN += 1
    prec = TP/(TP + FP)
    rec = TP/(TP + FN)
    F1 = 2*prec*rec / (prec + rec)
    return F1
print('F-score:',F1(y_pred,y_test,1,0))
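# Cross-check, assuming scikit-learn is available:
from sklearn.metrics import f1_score
print('sklearn F-score:', f1_score(y_test, y_pred))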
df3 = pd.read_csv('data3.csv')
df3.head()
df3['new feature'].value_counts()
# encode the string labels as 0/1
df3['new feature'] = df3['new feature'].replace({"True": 1, "False": 0})
from scipy.stats import bernoulli

# estimated Bernoulli parameter: the share of gamers with the new feature
nf = df3['new feature']
pt = sum(nf)/len(df3)
print(pt)
pf = 1 - pt
plt.bar(['Without', 'With'], [pf, pt], color='blue', width=0.4)
plt.title("Bernoulli pmf of gamers playing with and without the new feature")
y0 = df3[df3['new feature']==0]
y1 = df3[df3['new feature']==1]
x0 = y0['hours']
x1 = y1['hours']
# 3) Test statistic: Welch's two-sample t
sm0 = x0.mean()
sm1 = x1.mean()
sstd0 = x0.std()
sstd1 = x1.std()
N0 = len(x0)
N1 = len(x1)
t0 = (sm0 - sm1 - 0) / math.sqrt((sstd0**2/N0) + (sstd1**2/N1))
# Welch-Satterthwaite degrees of freedom
df = ((sstd0**2/N0) + (sstd1**2/N1))**2 / (((sstd0**2/N0)**2/(N0-1)) + ((sstd1**2/N1)**2/(N1-1)))
# 4) Null distribution
from scipy.stats import t
x = np.linspace(-4, 4, 1000)
plt.plot(x, t.pdf(x, df))
# 5) p-value (two-sided)
p = 2*min(t.cdf(t0, df=df), 1 - t.cdf(t0, df=df))
print(p)
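# Cross-check: scipy's Welch test should reproduce t0 and p.
from scipy.stats import ttest_ind
print(ttest_ind(x0, x1, equal_var=False))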
xa = df3['hours']
xb = df3['monthly hours (before experiment)']
# 3) Test statistic: paired t on the per-gamer differences
df3['hours_diff'] = df3['hours'] - df3['monthly hours (before experiment)']
N = len(df3)
m = df3['hours_diff'].mean()
dif2 = []
for i in df3['hours_diff']:
    dif2.append((i - m)**2)
std = math.sqrt((1/(N-1))*sum(dif2))
t0 = (m - 0) / (std/math.sqrt(N))
print(t0)
# 4) Null distribution
x = np.linspace(-4, 4, 1000)
plt.plot(x, t.pdf(x, N-1))
# 5) p-value (two-sided, with N-1 degrees of freedom)
p = 2*min(t.cdf(t0, df=N-1), 1 - t.cdf(t0, df=N-1))
print(p)
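# Cross-check: scipy's paired test on the raw columns should match t0 and p.
from scipy.stats import ttest_rel
print(ttest_rel(xa, xb))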
import scipy.stats as stats

# test 1: power of the Welch test against a mean shift of 2.5
alpha = 0.01
x = np.linspace(-4, 8, 1000)
H0_1 = t.pdf(x, df)
HA_1 = t.pdf(x - 2.5, df)
confidence_level = 1 - alpha
q01 = t.ppf(alpha/2, df)
q11 = t.ppf(1 - alpha/2, df)
#print(q01, q11)
plt.plot(x, H0_1)
plt.fill_between(x=x, y1=H0_1, where=(q11 < x) & (x < 4), color="r")
plt.plot(x, HA_1)
plt.fill_between(x=x, y1=HA_1, where=(q11 < x) & (x < 8), color="r")
plt.show()
# the power of the first test (upper rejection region only) is:
power1 = 1 - t.cdf(q11 - 2.5, df=df)
print("Power test 1:", power1)
# test 2: power of the paired test against the same shift
H0_2 = t.pdf(x, N-1)
HA_2 = t.pdf(x - 2.5, N-1)
q02 = t.ppf(alpha/2, N-1)
q12 = t.ppf(1 - alpha/2, N-1)
#print(q02, q12)
plt.plot(x, H0_2)
plt.fill_between(x=x, y1=H0_2, where=(q12 < x) & (x < 4), color="r")
plt.plot(x, HA_2)
plt.fill_between(x=x, y1=HA_2, where=(q12 < x) & (x < 8), color="r")
plt.show()
# the power of the second test is:
power2 = 1 - t.cdf(q12 - 2.5, df=N-1)
print("Power test 2:", power2)
# effect size: the shift of 2.5 divided by the sample standard deviation
d = 2.5/std
print(d)
# z-value for a two-sided test at alpha = 0.01
z1_a2 = norm.ppf(1 - 0.01/2)
# and z-value for the target power of 0.8
z1_b = norm.ppf(0.8)
print(z1_a2, z1_b)
# then for a z-test the n required is:
n_required = ((z1_a2 + z1_b)/d)**2
print(n_required)
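# Sanity check: with ceil(n_required) samples, the normal-approximation power
# 1 - Phi(z_{1-alpha/2} - d*sqrt(n)) should come out near the 0.8 target
# (this approximation ignores the lower rejection tail).
n = math.ceil(n_required)
print('achieved power:', 1 - norm.cdf(z1_a2 - d*math.sqrt(n)))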
# create dataframe of adolescents
print(df3['age'].sort_values())
# no gamer is under 10, so an upper age cutoff suffices
df_adol = df3[df3['age'] <= 19]
# only looking at those who got the feature
df_feature = df_adol[df_adol['new feature'] == 1]
# separate sets for addicts and non-addicts before the experiment
df_abf = df_feature[df_feature['monthly hours (before experiment)'] > 85]
df_nbf = df_feature[df_feature['monthly hours (before experiment)'] <= 85]
# counts of addicts and non-addicts after the feature was given
df_aaf = len(df_feature[df_feature['hours'] > 85])
df_naf = len(df_feature[df_feature['hours'] <= 85])
# not addicts before and not after:
n00 = len(df_nbf[df_nbf['hours'] <= 85])
# not addicts before but addicts after:
n01 = len(df_nbf[df_nbf['hours'] > 85])
# addicts before and addicts after:
n11 = len(df_abf[df_abf['hours'] > 85])
# addicts before and not addicts after:
n10 = len(df_abf[df_abf['hours'] <= 85])
# test statistic: the smaller of the two discordant counts
print('discordance:', n01 + n10)
s = min(n01, n10)
print(s)
# null distribution: Bin(n01 + n10, 0.5)
# p-value (exact McNemar test, two-sided)
p = min(1, 2*binom.cdf(s, n01 + n10, 0.5))
print(p)
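# Cross-check, assuming statsmodels is installed: the exact McNemar test on the
# 2x2 contingency table should give the same p-value.
from statsmodels.stats.contingency_tables import mcnemar
print(mcnemar([[n00, n01], [n10, n11]], exact=True))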