import pandas as pd

df1 = pd.read_csv('data1.csv')
df1.head()
x = df1['x']
z = df1['z']
from scipy.stats import chi2
import matplotlib.pyplot as plt
import numpy as np

# MLE fit of a chi-squared distribution to x (returns shape a, loc, scale)
a, loc, scale = chi2.fit(x)
print('Parameters a:', a, 'loc:', loc, 'scale:', scale)
x_gen = np.linspace(-4, 10, 1000)
plt.plot(x_gen, chi2.pdf(x_gen, a, loc, scale))
plt.title(f'PDF of chi2({a:.2f}, {loc:.2f}, {scale:.2f})')
# P(0.8 < X <= 2.5) under the fitted distribution
p0 = chi2.cdf(2.5, a, loc, scale) - chi2.cdf(0.8, a, loc, scale)
print(p0)
# P(X > 1.2)
p1 = 1 - chi2.cdf(1.2, a, loc, scale)
print(p1)
# the product is the joint probability, assuming two independent draws
print(p0*p1)
import math

# log-likelihood of the chi-squared fit, summed over the sample
llf_data = []
for i in x:
    lf = chi2.pdf(i, a, loc, scale)
    llf_data.append(math.log(lf))
print(sum(llf_data))
from scipy.stats import norm

# for comparison, fit a normal and compute its log-likelihood
sm, sstd = norm.fit(x)
llf_data2 = []
for i in x:
    lf = norm.pdf(i, sm, sstd)
    llf_data2.append(math.log(lf))
print(sum(llf_data2))
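# Sanity check: scipy's logpdf should reproduce the two manual log-likelihood
# sums above (assuming no zero densities in the sample).
print(np.sum(chi2.logpdf(x, a, loc, scale)), np.sum(norm.logpdf(x, sm, sstd)))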
import statistics
import random

# bootstrap confidence interval for the mode of z
N = len(z)
mode_hat = statistics.mode(z)
m = []
e = []
for i in range(100):
    rs = random.choices(list(z), k=N)
    m.append(statistics.mode(rs))
    e.append(m[-1] - mode_hat)
q0 = np.quantile(e, 0.05)
q1 = np.quantile(e, 0.95)
print('90% CI:[', mode_hat - q1, ',', mode_hat - q0, ']')
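# Cross-check, assuming a plain percentile bootstrap is acceptable here:
# take the 5% and 95% quantiles of the bootstrap modes directly.
print('90% percentile CI:[', np.quantile(m, 0.05), ',', np.quantile(m, 0.95), ']')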
# manual histogram of x with 5 equal-width bins
min_x = min(x)
max_x = max(x)
range_x = max_x - min_x
print(max_x)
step = range_x/5
bins = [min_x + step, min_x + 2*step, min_x + 3*step, min_x + 4*step, min_x + 5*step]
print(bins)
x_sorted = x.sort_values()
f1 = f2 = f3 = f4 = f5 = 0
for i in x_sorted:
    if i <= bins[0]:
        f1 += 1
    elif i <= bins[1]:
        f2 += 1
    elif i <= bins[2]:
        f3 += 1
    elif i <= bins[3]:
        f4 += 1
    else:
        f5 += 1
f_x = [f1, f2, f3, f4, f5]
plt.bar(bins, f_x)
plt.show()
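# The built-in histogram with 5 equal-width bins should reproduce these counts:
counts, edges, _ = plt.hist(x, bins=5)
print(counts)
plt.show()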
# same binning for z
min_z = min(z)
max_z = max(z)
range_z = max_z - min_z
step = range_z/5
binsz = [min_z + step, min_z + 2*step, min_z + 3*step, min_z + 4*step, min_z + 5*step]
print(binsz)
z_sorted = z.sort_values()
y1 = y2 = y3 = y4 = y5 = 0
for i in z_sorted:
    if i <= binsz[0]:
        y1 += 1
    elif i <= binsz[1]:
        y2 += 1
    elif i <= binsz[2]:
        y3 += 1
    elif i <= binsz[3]:
        y4 += 1
    else:
        y5 += 1
y_z = [y1, y2, y3, y4, y5]
plt.bar(binsz, y_z)
plt.show()
train = pd.read_csv('data2_train.csv')
train.head()
v1 = train['var1']
v2 = train['var2']
y = train['target']
from scipy.stats import beta

# estimate parameters of the class-conditional likelihoods for var2
y0 = train[train['target']==0]
v2_c0 = y0['var2']
y1 = train[train['target']==1]
v2_c1 = y1['var2']
pars_c0 = beta.fit(v2_c0)
pars_c1 = beta.fit(v2_c1)
# estimate priors
p_c0 = len(v2_c0)/len(v2)
p_c1 = len(v2_c1)/len(v2)
print(p_c0 + p_c1)  # sanity check: the priors should sum to 1
# unnormalized class scores at var2 = 0.27
c0_score = p_c0*beta.pdf(0.27, *pars_c0)
c1_score = p_c1*beta.pdf(0.27, *pars_c1)
print(c0_score, c1_score)
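# Normalizing the two scores gives posterior class probabilities at var2 = 0.27:
total = c0_score + c1_score
print('P(C0|0.27) =', c0_score/total, 'P(C1|0.27) =', c1_score/total)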
from scipy.stats import binom

# binomial mixture fitted with EM on the class-0 var1 counts
X = y0['var1']
plt.hist(X)
# integer support for the pmf plots (binom.pmf is zero off the integers)
bins = np.arange(np.min(X), np.max(X) + 1)
X = np.array(X)
print("Dataset shape:", X.shape)

def pdf(data, samplesize, prob):
    # binomial pmf used as the mixture component "pdf"
    return binom.pmf(data, samplesize, prob)

# define the number of clusters to be learned
k = 3
weights = np.ones(k) / k
samplesizes = np.random.choice(X, k)
probs = np.random.random_sample(size=k)
print(samplesizes, probs)
eps = 1e-8
for step in range(25):
    # plot the current component fits at each iteration
    plt.figure(figsize=(10, 6))
    plt.xlabel("$x$")
    plt.ylabel("pdf")
    plt.title("Iteration {}".format(step))
    plt.scatter(X, [0.005] * len(X), color='navy', s=30, marker=2, label="Train data")
    plt.plot(bins, pdf(bins, samplesizes[0], probs[0]), color='blue', label="Cluster 1")
    plt.plot(bins, pdf(bins, samplesizes[1], probs[1]), color='green', label="Cluster 2")
    plt.plot(bins, pdf(bins, samplesizes[2], probs[2]), color='magenta', label="Cluster 3")
    plt.legend(loc='upper left')
    plt.savefig("img_{0:02d}".format(step), bbox_inches='tight')
    plt.show()
    # Expectation step: likelihood of each observation under each component
    likelihood = []
    for j in range(k):
        likelihood.append(pdf(X, samplesizes[j], probs[j]))
    likelihood = np.array(likelihood)
    b = []
    # Maximization step
    for j in range(k):
        # posterior probability (responsibility) of each observation having
        # been generated by component j, given the current parameters
        b.append((likelihood[j] * weights[j]) / (np.sum([likelihood[i] * weights[i] for i in range(k)], axis=0) + eps))
        # update the success probability: responsibility-weighted mean count
        # divided by the number of trials; the sample sizes drawn at
        # initialization are kept fixed, since an MLE update for n is unstable
        probs[j] = np.clip(np.sum(b[j] * X) / ((np.sum(b[j]) + eps) * samplesizes[j]), eps, 1 - eps)
        # update the mixture weights
        weights[j] = np.mean(b[j])
# after running the algorithm the final parameters are
print('Sample sizes:', samplesizes, 'Probs:', probs)
print('Pis:', weights)
# plot the fitted mixture pmf over the integer support of the data
test = np.arange(0, np.max(X) + 1)
pdf1 = binom.pmf(test, samplesizes[0], probs[0])
pdf2 = binom.pmf(test, samplesizes[1], probs[1])
pdf3 = binom.pmf(test, samplesizes[2], probs[2])
mix_pdf = weights[0]*pdf1 + weights[1]*pdf2 + weights[2]*pdf3
plt.plot(test, mix_pdf)
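# Convergence check (a sketch): the mixture log-likelihood should be
# non-decreasing across EM iterations if the updates above are correct.
mix = np.sum([weights[j]*pdf(X, samplesizes[j], probs[j]) for j in range(k)], axis=0)
print('final log-likelihood:', np.sum(np.log(mix + eps)))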
X_0 = train[train['target']==0]
x01 = X_0['var1']
x02 = X_0['var2']
X_1 = train[train['target']==1]
x11 = X_1['var1']
x12 = X_1['var2']
# parameters for the class-conditional likelihoods:
# var1 is a count, modeled as binomial per class; since n is not given, (n, p)
# is estimated by the method of moments (p = 1 - s^2/m, n = m/p), which
# assumes the counts are underdispersed (s^2 < m)
def binom_mom(counts):
    m, s2 = counts.mean(), counts.var()
    p = 1 - s2/m
    return round(m/p), p

d01 = binom_mom(x01)  # class 0, var1
d11 = binom_mom(x11)  # class 1, var1
# var2 lies in (0, 1) and is modeled with a beta distribution per class
d02 = beta.fit(x02)   # class 0, var2
d12 = beta.fit(x12)   # class 1, var2
print(d01, d02)
print(d11, d12)
# priors
p0 = len(X_0)/len(train)
p1 = len(X_1)/len(train)
y_test = pd.read_csv('data2_test_label.csv')
X_test = pd.read_csv('data2_test.csv')
y_test.head()
X_test.head()
# classify each test point by the larger of the two class scores,
# matching each class with its own likelihood parameters
y_pred = []
for i, j in zip(X_test['var1'], X_test['var2']):
    s0 = p0 * binom.pmf(i, d01[0], d01[1]) * beta.pdf(j, *d02)
    s1 = p1 * binom.pmf(i, d11[0], d11[1]) * beta.pdf(j, *d12)
    y_pred.append(0 if s0 > s1 else 1)
y_pred = np.array(y_pred)
y_test = np.array(y_test).ravel()  # flatten to 1-D (assumes a single label column)
def F1(pred, true, T, N1):
    # count the confusion-matrix cells, with T the positive label
    # and N1 the negative label
    TP = 0
    TN = 0
    FP = 0
    FN = 0
    for i in range(len(pred)):
        if pred[i] == T:
            if true[i] == T:
                TP += 1
            else:
                FP += 1
        if pred[i] == N1:
            if true[i] == N1:
                TN += 1
            else:
                FN += 1
    prec = TP/(TP + FP)
    rec = TP/(TP + FN)
    F1 = 2*prec*rec / (prec + rec)
    return F1
print('F-score:',F1(y_pred,y_test,1,0))
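# Cross-check, assuming scikit-learn is available:
from sklearn.metrics import f1_score
print('sklearn F-score:', f1_score(y_test, y_pred))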
df3 = pd.read_csv('data3.csv')
df3.head()
df3['new feature'].value_counts()
# encode the string labels as 0/1
df3['new feature'] = df3['new feature'].replace({"True": 1, "False": 0})
from scipy.stats import bernoulli

# estimated Bernoulli parameter: the share of gamers with the new feature
nf = df3['new feature']
pt = sum(nf)/len(df3)
print(pt)
pf = 1 - pt
plt.bar(['Without', 'With'], [pf, pt], color='blue', width=0.4)
plt.title("Bernoulli pmf of gamers playing with and without the new feature")
y0 = df3[df3['new feature']==0]
y1 = df3[df3['new feature']==1]
x0 = y0['hours']
x1 = y1['hours']
# 3) Test statistic: Welch's two-sample t
sm0 = x0.mean()
sm1 = x1.mean()
sstd0 = x0.std()
sstd1 = x1.std()
N0 = len(x0)
N1 = len(x1)
t0 = (sm0 - sm1 - 0) / math.sqrt((sstd0**2/N0) + (sstd1**2/N1))
# Welch-Satterthwaite degrees of freedom
df = ((sstd0**2/N0) + (sstd1**2/N1))**2 / (((sstd0**2/N0)**2/(N0-1)) + ((sstd1**2/N1)**2/(N1-1)))
# 4) Null distribution
from scipy.stats import t
x = np.linspace(-4, 4, 1000)
plt.plot(x, t.pdf(x, df))
# 5) p-value (two-sided)
p = 2*min(t.cdf(t0, df=df), 1 - t.cdf(t0, df=df))
print(p)
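# Cross-check: scipy's Welch test should reproduce t0 and p.
from scipy.stats import ttest_ind
print(ttest_ind(x0, x1, equal_var=False))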
xa = df3['hours']
xb = df3['monthly hours (before experiment)']
# 3) Test statistic: paired t on the per-gamer differences
df3['hours_diff'] = df3['hours'] - df3['monthly hours (before experiment)']
N = len(df3)
m = df3['hours_diff'].mean()
dif2 = []
for i in df3['hours_diff']:
    dif2.append((i - m)**2)
std = math.sqrt((1/(N-1))*sum(dif2))
t0 = (m - 0) / (std/math.sqrt(N))
print(t0)
# 4) Null distribution
x = np.linspace(-4, 4, 1000)
plt.plot(x, t.pdf(x, N-1))
# 5) p-value (two-sided, with N-1 degrees of freedom)
p = 2*min(t.cdf(t0, df=N-1), 1 - t.cdf(t0, df=N-1))
print(p)
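# Cross-check: scipy's paired test on the raw columns should match t0 and p.
from scipy.stats import ttest_rel
print(ttest_rel(xa, xb))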
import scipy.stats as stats

# test 1: power of the Welch test against a mean shift of 2.5
alpha = 0.01
x = np.linspace(-4, 8, 1000)
H0_1 = t.pdf(x, df)
HA_1 = t.pdf(x - 2.5, df)
confidence_level = 1 - alpha
q01 = t.ppf(alpha/2, df)
q11 = t.ppf(1 - alpha/2, df)
#print(q01, q11)
plt.plot(x, H0_1)
plt.fill_between(x=x, y1=H0_1, where=(q11 < x) & (x < 4), color="r")
plt.plot(x, HA_1)
plt.fill_between(x=x, y1=HA_1, where=(q11 < x) & (x < 8), color="r")
plt.show()
# the power of the first test (upper rejection region only) is:
power1 = 1 - t.cdf(q11 - 2.5, df=df)
print("Power test 1:", power1)
# test 2: power of the paired test against the same shift
H0_2 = t.pdf(x, N-1)
HA_2 = t.pdf(x - 2.5, N-1)
q02 = t.ppf(alpha/2, N-1)
q12 = t.ppf(1 - alpha/2, N-1)
#print(q02, q12)
plt.plot(x, H0_2)
plt.fill_between(x=x, y1=H0_2, where=(q12 < x) & (x < 4), color="r")
plt.plot(x, HA_2)
plt.fill_between(x=x, y1=HA_2, where=(q12 < x) & (x < 8), color="r")
plt.show()
# the power of the second test is:
power2 = 1 - t.cdf(q12 - 2.5, df=N-1)
print("Power test 2:", power2)
# effect size: the shift of 2.5 divided by the sample standard deviation
d = 2.5/std
print(d)
# z-value for a two-sided test at alpha = 0.01
z1_a2 = norm.ppf(1 - 0.01/2)
# and z-value for the target power of 0.8
z1_b = norm.ppf(0.8)
print(z1_a2, z1_b)
# then for a z-test the n required is:
n_required = ((z1_a2 + z1_b)/d)**2
print(n_required)
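# Sanity check: with ceil(n_required) samples, the normal-approximation power
# 1 - Phi(z_{1-alpha/2} - d*sqrt(n)) should come out near the 0.8 target
# (this approximation ignores the lower rejection tail).
n = math.ceil(n_required)
print('achieved power:', 1 - norm.cdf(z1_a2 - d*math.sqrt(n)))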
# create dataframe of adolescents
print(df3['age'].sort_values())
# no gamer is under 10, so an upper age cutoff suffices
df_adol = df3[df3['age'] <= 19]
# only looking at those who got the feature
df_feature = df_adol[df_adol['new feature'] == 1]
# separate sets for addicts and non-addicts before the experiment
df_abf = df_feature[df_feature['monthly hours (before experiment)'] > 85]
df_nbf = df_feature[df_feature['monthly hours (before experiment)'] <= 85]
# counts of addicts and non-addicts after the feature was given
df_aaf = len(df_feature[df_feature['hours'] > 85])
df_naf = len(df_feature[df_feature['hours'] <= 85])
# not addicts before and not after:
n00 = len(df_nbf[df_nbf['hours'] <= 85])
# not addicts before but addicts after:
n01 = len(df_nbf[df_nbf['hours'] > 85])
# addicts before and addicts after:
n11 = len(df_abf[df_abf['hours'] > 85])
# addicts before and not addicts after:
n10 = len(df_abf[df_abf['hours'] <= 85])
# test statistic: the smaller of the two discordant counts
print('discordance:', n01 + n10)
s = min(n01, n10)
print(s)
# null distribution: Bin(n01 + n10, 0.5)
# p-value (exact McNemar test, two-sided)
p = min(1, 2*binom.cdf(s, n01 + n10, 0.5))
print(p)
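# Cross-check, assuming statsmodels is installed: the exact McNemar test on the
# 2x2 contingency table should give the same p-value.
from statsmodels.stats.contingency_tables import mcnemar
print(mcnemar([[n00, n01], [n10, n11]], exact=True))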