# DIT862 Assignment 1

# Start writing code here...
import math
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Load the house data. The file is read without a header row, so columns are
# addressed by position; column 1 is the one drawn in the "house prices"
# plots of this script.
df = pd.read_csv('houses.csv', header=None)

# Summary statistics of column 1. Printed explicitly so the result is also
# visible when the file runs as a plain script rather than a notebook cell.
print(df[1].describe())

# Histogram of all values in column 1 (the house prices).
df[1].hist()
plt.title("Histogram of house prices")
plt.ylabel("Freq")
plt.xlabel("Cost")
# Finish this figure so that later plots do not draw onto the same axes.
plt.show()

# Column 1 plotted against the row index, in file order.
plt.plot(df[1])
plt.title("House prices")
plt.xlabel("Index")
plt.ylabel("Cost")
# Finish this figure so that later plots do not draw onto the same axes.
plt.show()

# Keep only the rows whose price (column 1) lies strictly below the 95th
# percentile, which removes the most expensive tail before plotting.
df95 = df[df[1] < df[1].quantile(.95)]  # use quantile to subset

# Plot the price column explicitly: DataFrame.hist() would draw one subplot
# per numeric column of the frame instead of the single price histogram that
# the title and axis labels describe.
df95[1].hist(bins=50)
plt.title("Histogram of 95 quantile of house prices")
plt.ylabel("Freq")
plt.xlabel("Cost")
# Finish this figure so that later plots do not draw onto the same axes.
plt.show()

# Split the trimmed data on column 13: rows equal to "GREATER LONDON" versus
# every other row (including rows where column 13 is missing, exactly as the
# `!=` comparison would classify them).
london_mask = df95[13] == "GREATER LONDON"
insideLondon = df95[london_mask]
outsideLondon = df95[~london_mask]

import numpy as np
import matplotlib.pyplot as plt

# Prices (column 1) of the two groups.
x = insideLondon[1]
y = outsideLondon[1]

# Draw the comparison twice: first with raw counts, then with density=True so
# that each group is normalised on its own and groups of different sizes can
# be compared by shape.
for density, y_label in ((False, "Freq"), (True, "Density")):
    # Passing both series in one call plots both histograms in the same axes.
    plt.hist([x, y], label=['Inside London', 'Outside London'], bins=10,
             density=density)
    plt.legend(loc='upper right')
    plt.title("Real estate compared inside and outside London")
    plt.xlabel("Cost")
    plt.ylabel(y_label)
    plt.show()

# Load the Titanic training data (this file is read with its header row).
titanic = pd.read_csv('titanic_train.csv')

# Show the frame; printed explicitly so it is also visible outside a notebook.
print(titanic)

from collections import Counter

# We will focus on titanic[['Embarked','Pclass','Parch','Fare']]

# Embarked: estimated PMF as relative frequency of each category.
em = titanic['Embarked']  # subset data

# Missing entries are counted under an explicit "missing" label. Left as NaN
# they would become a float key among the string categories, which the
# categorical bar axis cannot plot. The label is a no-op when nothing is
# missing.
emCount = Counter(em.fillna("missing"))
print(emCount)

names = list(emCount.keys())
values = list(emCount.values())
# Divide by the total number of rows so the bars sum to 1.
valuesPMF = [count / len(em) for count in values]

plt.bar(names, valuesPMF)
plt.ylabel("f(x)")
plt.xlabel("Embarked Category")
# Finish this figure so that later plots do not draw onto the same axes.
plt.show()

# Pclass: estimated PMF as relative frequency of each class value.
pclass = titanic['Pclass']  # subset data
pclassCount = Counter(pclass)
print(pclassCount)

names = list(pclassCount.keys())
values = list(pclassCount.values())
# Divide by the total number of rows so the bars sum to 1.
valuesPMF = [count / len(pclass) for count in values]

plt.bar(names, valuesPMF)
plt.ylabel("f(x)")
plt.xlabel("Passanger Class Category")
# Finish this figure so that later plots do not draw onto the same axes.
plt.show()

# Parch: estimated PMF as relative frequency of each observed value.
parch = titanic['Parch']  # subset data
parchCount = Counter(parch)
print(parchCount)

names = list(parchCount.keys())
values = list(parchCount.values())
# Divide by the total number of rows so the bars sum to 1.
valuesPMF = [count / len(parch) for count in values]

plt.bar(names, valuesPMF)
plt.ylabel("f(x)")
plt.xlabel("Parch")
plt.title("estimated PMF of Parch")
# Finish this figure so that later plots do not draw onto the same axes.
plt.show()

# Fare = ticket price. It is a continuous quantity, so it is shown as a
# histogram (100 bins) rather than a per-value bar chart.
fare = titanic['Fare']  # subset data
fare.hist(bins=100)
plt.xlabel("fare")
plt.ylabel("Freq")
# Finish this figure so that later plots do not draw onto the same axes.
plt.show()

# Make a histogram w/o using .hist(): count each distinct Parch value and
# draw the raw counts as bars.
parch = titanic['Parch']  # subset data
parchCount = Counter(parch)
print(parchCount)

names = list(parchCount.keys())
values = list(parchCount.values())
# Raw counts, not probabilities; the variable name is kept from the PMF cells.
valuesPMF = list(values)

# With width=1 neighbouring bars touch, so the bar chart looks like a histogram.
plt.bar(names, valuesPMF, width=1)
plt.ylabel("f(x)")
plt.xlabel("Parch")
plt.title("Histogram/barplot for Parch")
plt.show()

# The same data through plt.hist for comparison, on its own figure.
plt.hist(parch, bins=7)
plt.show()

# Hand-made histogram of the fares, to be compared with plt.hist(bins=20).
fare = titanic['Fare']  # subset data

# Quantization range.
maxq = np.max(fare)
minq = np.min(fare)

# Bin edges: 21 equally spaced points are needed to delimit 20 bins.
I = np.linspace(start=minq, stop=maxq, num=21)

n_bins = len(I) - 1
counterVal = np.zeros(n_bins)  # one counter per bin, not per edge

for i, (left, right) in enumerate(zip(I[:-1], I[1:])):
    is_last_bin = i == n_bins - 1
    for value in fare:
        # Bins are half-open [left, right) so a value sitting exactly on an
        # edge is counted once; the last bin is closed on the right so the
        # maximum is not lost. Strict inequalities on both sides would drop
        # every value equal to an edge, including the minimum and maximum.
        if left <= value < right or (is_last_bin and value == right):
            counterVal[i] += 1
print(counterVal)

# Each bar starts at its bin's left edge and spans exactly one bin width.
plt.bar(I[:-1], counterVal, width=I[1] - I[0], align='edge')
plt.ylabel("Freq")
plt.xlabel("Fare")
plt.title("Histogram/barplot for Fare")
# Finish this figure so that later plots do not draw onto the same axes.
plt.show()

# Reference histogram of the fares drawn by Matplotlib with the same number
# of bins as the hand-made version (np.histogram(fare, bins=20) gives the
# counts without plotting).
plt.hist(fare, bins=20)
plt.title("histogram using hist()")
plt.ylabel("Freq")
plt.xlabel("Fare")
# Finish this figure so that later plots do not draw onto the same axes.
plt.show()

import math


def _nearest_rank_quantile(sorted_values, percent):
    """Return the `percent`-th percentile of an ascending list by nearest rank.

    The rank is ceil(n * percent / 100), clamped to at least 1 so that
    percent=0 yields the smallest value instead of wrapping around to the
    last element through index -1.
    """
    rank = max(math.ceil(len(sorted_values) * percent / 100), 1)
    return sorted_values[rank - 1]


# We have 891 points, so the 0.2 quantile sits at approximately position
# 0.2 * 891 in the ordered array.
l = len(fare)
print(l)
print(0.2 * l)

sortedFare = sorted(fare)  # sort fare to take our quantiles

p = 20  # 0.2 = 20%
q20 = _nearest_rank_quantile(sortedFare, p)
# np.quantile interpolates linearly by default, so it only agrees with the
# nearest-rank value when the quantile position falls on a data point, as it
# does for 891 points. Compare with a tolerance, not exact float equality.
assert math.isclose(q20, np.quantile(fare, 0.2))

p = 50  # % = 0.5
q50 = _nearest_rank_quantile(sortedFare, p)
assert math.isclose(q50, np.quantile(fare, 0.5))

# Uniform: draw N numbers from U(0,1) for several sample sizes and plot a
# histogram of each sample, to compare how the shape changes with N.
for N in [10, 100, 1000, 100000]:  # different sizes to compare
    x = np.random.rand(N)
    # Square-root rule of thumb for the number of bins.
    plt.hist(x, bins=int(np.sqrt(N)))
    plt.title(f'simulating n={N} numbers from uniform distribution U(0,1)')
    plt.ylabel("Freq")
    plt.xlabel("x")
    plt.show()

import matplotlib.pyplot as plt
import numpy as np

# Normal: draw samples from N(0,1) for a small and a large sample size and
# plot a histogram of each.
for sample_size in (100, 10000):
    x = np.random.normal(0, 1, sample_size)
    # Square-root rule of thumb for the number of bins.
    plt.hist(x, bins=int(np.sqrt(sample_size)))
    plt.title(f"simulating n={sample_size} numbers from Normal Distribution N(0,1) ")
    plt.ylabel("Freq")
    plt.show()

# b)
def success(p_success):
    """Simulate one trial that succeeds with probability `p_success`.

    Args:
        p_success: Probability of success. A uniform random number in [0, 1)
            is drawn and compared against it, so 0 never succeeds and 1
            always does.

    Returns:
        True if the trial succeeded, False otherwise.
    """
    return bool(np.random.rand() < p_success)


def exam_score(p_correct, n_instances):
    """Simulate an exam by running `success()` once per question.

    Args:
        p_correct: Probability of answering a single question correctly.
        n_instances: Number of questions on the exam.

    Returns:
        The number of correctly answered questions, an int between 0 and
        `n_instances`.
    """
    return sum(success(p_correct) for _ in range(n_instances))


# Example: one simulated 10-question exam with a 50% chance per question.
print(exam_score(0.5, 10))

# Investigating the distribution of the exam score: simulate N exams of
# n questions each, every question answered correctly with probability p.
N = 10000
p = 0.8
n = 20
results = [exam_score(p, n) for _ in range(N)]  # run N times

plt.hist(results)
plt.xlabel("Test score")
plt.ylabel("Freq")
plt.title("Histogram of test score distribution")
# Finish this figure so that later plots do not draw onto the same axes.
plt.show()

# c)
def number_of_attempts(p_pass):
    """Simulate re-taking an exam until it is passed.

    Args:
        p_pass: Probability of passing on any single attempt. Must be
            greater than 0.

    Returns:
        The number of exams taken, counting the one that was passed
        (so the result is always at least 1).

    Raises:
        ValueError: If `p_pass` is not greater than 0 (including NaN); the
            exam could then never be passed and the simulation would loop
            forever.
    """
    if not p_pass > 0:
        raise ValueError("p_pass must be greater than 0")

    exams = 1  # begins at 1 so that the passing attempt is counted
    # Each failed attempt (random draw not below p_pass) adds one more exam.
    while np.random.rand() >= p_pass:
        exams += 1
    return exams


print(number_of_attempts(0.4))

# Simulate N students who each re-take the exam until they pass, with pass
# probability p per attempt, and collect how many attempts each one needed.
N = 10000
p = 0.4
attempts = [number_of_attempts(p) for _ in range(N)]  # run N times

plt.hist(attempts, bins=20)
# The plotted quantity is the number of attempts, not a test score.
plt.xlabel("Number of attempts")
plt.ylabel("Freq")
plt.title("Histogram of avg number of attempts to Pass exam")
# Finish this figure so that later plots do not draw onto the same axes.
plt.show()

# Average number of attempts; printed so it is visible outside a notebook.
# For a per-attempt pass probability p the expected value is 1 / p.
print(np.mean(attempts))