# Start writing code here...
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the house data. The file has no header row, so columns are addressed
# by position; column 1 is the one plotted as the price ("Cost") below.
df = pd.read_csv('houses.csv', header=None)
df[1].describe()  # bare expression: only displayed when run interactively

# Every plot gets its own figure and is shown before the next one starts.
# Without this, consecutive plots are drawn on top of each other on the same
# axes when the file is executed top to bottom.
plt.figure()
df[1].hist()
plt.title("Histogram of house prices")
plt.ylabel("Freq")
plt.xlabel("Cost")
plt.show()

# Prices in file order.
plt.figure()
plt.plot(df[1])
plt.title("House prices")
plt.xlabel("Index")
plt.ylabel("Cost")
plt.show()

# Keep only the rows whose price is below the 95th percentile.
df95 = df[df[1] < df[1].quantile(.95)]

# Histogram of the price column only (DataFrame.hist would draw one subplot
# per numeric column, while the title below is about prices).
plt.figure()
df95[1].hist(bins=50)
plt.title("Histogram of 95 quantile of house prices")
plt.ylabel("Freq")
plt.xlabel("Cost")
plt.show()
# Partition the trimmed data on column 13: rows labelled "GREATER LONDON"
# go to insideLondon, every other row goes to outsideLondon.
in_london_mask = df95[13] == "GREATER LONDON"
insideLondon = df95[in_london_mask]
outsideLondon = df95[~in_london_mask]
import numpy as np
import matplotlib.pyplot as plt
# Column 1 (plotted as "Cost") for each of the two groups.
x = insideLondon[1]
y = outsideLondon[1]

# Raw counts, both groups side by side in the same plot. A fresh figure is
# opened first so the bars do not land on a previously drawn plot.
plt.figure()
plt.hist([x, y], label=['Inside London', 'Outside London'], bins=10)
plt.legend(loc='upper right')
plt.title("Real estate compared inside and outside London")
plt.xlabel("Cost")
plt.ylabel("Freq")
plt.show()

# Same comparison with density=True, so each group is normalised on its own
# and the two shapes can be compared despite different group sizes.
plt.figure()
plt.hist([x, y], label=['Inside London', 'Outside London'], bins=10, density=True)
plt.legend(loc='upper right')
plt.title("Real estate compared inside and outside London")
plt.xlabel("Cost")
plt.ylabel("Density")
plt.show()  # finish this figure so later plots start on clean axes
# Load the Titanic training data; columns are accessed by name further down
# ('Embarked', 'Pclass', 'Parch', 'Fare').
titanic = pd.read_csv('titanic_train.csv')
# Bare expression: shows the frame only when run interactively (notebook/REPL).
titanic
from collections import Counter
# We will focus on these four columns.
titanic[['Embarked','Pclass','Parch','Fare']]  # displayed only interactively

# Embarked: estimate the PMF as the relative frequency of each category.
em = titanic['Embarked']
emCount = Counter(em)  # category -> number of occurrences
print(emCount)

# Labels are converted to text because a missing value shows up as a NaN
# (float) key, and a categorical bar axis cannot mix strings with floats.
names = [str(label) for label in emCount.keys()]
values = list(emCount.values())
# Divide by the total number of rows so the bar heights sum to 1.
valuesPMF = [count / len(em) for count in values]

plt.figure()  # own figure, so nothing is drawn over an earlier plot
plt.bar(names, valuesPMF)
plt.ylabel("f(x)")
plt.xlabel("Embarked Category")
plt.show()
# Pclass: relative frequency of each value.
pclass = titanic['Pclass']
pclassCount = Counter(pclass)  # value -> number of occurrences
print(pclassCount)
names = list(pclassCount.keys())
values = list(pclassCount.values())
valuesPMF = [count / len(pclass) for count in values]
# Separate figure per chart; otherwise the bars are added to whatever plot
# was drawn last.
plt.figure()
plt.bar(names, valuesPMF)
plt.ylabel("f(x)")
plt.xlabel("Passanger Class Category")
plt.show()

# Parch: same estimate for the 'Parch' column.
parch = titanic['Parch']
parchCount = Counter(parch)
print(parchCount)
names = list(parchCount.keys())
values = list(parchCount.values())
valuesPMF = [count / len(parch) for count in values]
plt.figure()
plt.bar(names, valuesPMF)
plt.ylabel("f(x)")
plt.xlabel("Parch")
plt.title("estimated PMF of Parch")
plt.show()
# Fare = ticket price. Histogram with 100 bins.
fare = titanic['Fare']
plt.figure()  # Series.hist draws on the current axes, so start a clean figure
fare.hist(bins=100)
plt.xlabel("fare")
plt.ylabel("Freq")
plt.show()
# Make a histogram without using .hist(): count each Parch value and draw
# the raw counts as bars.
parch = titanic['Parch']
parchCount = Counter(parch)  # value -> number of occurrences
print(parchCount)
names = list(parchCount.keys())
values = list(parchCount.values())
valuesPMF = list(values)  # raw counts here (no normalisation), name kept as-is

plt.figure()
# With width=1 the bars touch each other, which basically gives a histogram.
plt.bar(names, valuesPMF, width=1)
plt.ylabel("f(x)")
plt.xlabel("Parch")
plt.title("Histogram/barplot for Parch")
plt.show()

# Reference plot from plt.hist, on its own figure so that it can be compared
# with the bar chart instead of being drawn on top of it.
plt.figure()
plt.hist(parch, bins=7)
plt.show()
fare = titanic['Fare']

# Quantization range.
maxq = np.max(fare)
minq = np.min(fare)

# Bin edges: 21 equally spaced points give 20 bins, matching hist(bins=20).
I = np.linspace(start=minq, stop=maxq, num=21)
num_bins = len(I) - 1
counterVal = np.zeros(num_bins)  # one counter per bin

# Count by hand. Bins are half-open [left, right) so a value sitting exactly
# on an edge is counted once; the last bin also includes its right edge so
# the maximum is not lost. This is the convention plt.hist / np.histogram use.
for i in range(num_bins):
    is_last_bin = i == num_bins - 1
    for j in fare:  # loop through all entries
        if I[i] <= j < I[i + 1] or (is_last_bin and j == I[i + 1]):
            counterVal[i] += 1
counterVal  # displayed only interactively

plt.figure()
# Bars start at each left edge and span exactly one bin, so the picture lines
# up with plt.hist even when the smallest value is not 0.
plt.bar(I[:-1], counterVal, width=np.diff(I), align='edge')
plt.ylabel("Freq")
plt.xlabel("Fare")
plt.title("Histogram/barplot for Fare")
plt.show()

# Reference: the same histogram produced by matplotlib itself.
plt.figure()
plt.hist(fare, bins=20)
plt.title("histogram using hist()")
plt.ylabel("Freq")
plt.xlabel("Fare")
plt.show()
# Both lines are bare expressions: they only display a value when run
# interactively and have no effect in a script.
len(fare) # we have 891 points; the 0.2 quantile will be at approximately position 0.2*891 in an ordered array.
0.2*891
import math
# Quantiles by hand: sort the fares and pick the element whose 1-based rank
# is ceil(l * p / 100), then compare against np.quantile.
sortedFare = sorted(fare)
l = len(fare)

# 20% quantile
p = 20
rank = math.ceil(l * p / 100)  # 1-based rank of the quantile element
q20 = sortedFare[rank - 1]
q20 == np.quantile(fare, 0.2)  # True
assert q20 == np.quantile(fare, 0.2)

# 50% quantile
p = 50
rank = math.ceil(l * p / 100)
q50 = sortedFare[rank - 1]
q50 == np.quantile(fare, 0.5)  # True
assert q50 == np.quantile(fare, 0.5)
# Uniform: draw N samples from np.random.rand for several sample sizes and
# show one histogram per size (bin count = sqrt(N), truncated to an int).
for N in (10, 100, 1000, 100000):
    x = np.random.rand(N)
    bin_count = int(np.sqrt(N))
    plt.hist(x, bins=bin_count)
    plt.title(f"simulating n={N} numbers from uniform distribution U(0,1)")
    plt.ylabel("Freq")
    plt.xlabel("x")
    plt.show()
import matplotlib.pyplot as plt
import numpy as np
# Normal: draw samples from N(0,1) for two sample sizes and show a histogram
# of each, using sqrt(n) bins (rule of thumb for the number of bins).
for sample_size in (100, 10000):
    x = np.random.normal(0, 1, sample_size)
    plt.hist(x, bins=int(np.sqrt(sample_size)))
    plt.title(f"simulating n={sample_size} numbers from Normal Distribution N(0,1) ")
    plt.ylabel("Freq")
    plt.show()
#b)
def success(p_success):
    """Run a single random trial.

    Draws one number from np.random.rand() and reports whether it is
    strictly below ``p_success``.

    Returns:
        True if the draw is below ``p_success``, otherwise False.
    """
    draw = np.random.rand()
    return bool(draw < p_success)
def exam_score(p_correct, n_instances):
    """Count the successful trials among ``n_instances`` calls to success().

    Each call is made with ``p_correct``; the return value is the number of
    calls that returned True.
    """
    return sum(success(p_correct) for _ in range(n_instances))
exam_score(0.5,10)  # single example run; value is shown only interactively

# Investigating the distribution: repeat the exam N times with n questions
# and success probability p per question, collecting every score.
N = 10000
p = 0.8
n = 20
results = [exam_score(p, n) for _ in range(N)]

plt.figure()  # own figure, so nothing is drawn over an earlier plot
# Scores are integers in 0..n, so use one bin per score, centred on the
# integer. Default bins have non-integer width and lump an uneven number of
# scores together.
plt.hist(results, bins=np.arange(n + 2) - 0.5)
plt.xlabel("Test score")
plt.ylabel("Freq")
plt.title("Histogram of test score distribution")
plt.show()
#c)
#p_pass = 0.4
def number_of_attempts(p_pass):
    """Simulate retaking an exam until it is passed.

    Each attempt draws a number from np.random.rand(); the attempt is a pass
    when the draw is strictly below ``p_pass``.

    Args:
        p_pass: probability of passing a single attempt; must be > 0.

    Returns:
        The number of attempts taken, counting the one that passed (>= 1).

    Raises:
        ValueError: if ``p_pass`` is not greater than 0 (including NaN),
            because then no attempt could ever pass and the loop would
            never finish.
    """
    if not p_pass > 0:
        raise ValueError("p_pass must be greater than 0")

    exams = 1  # starts at 1 so the passing attempt is counted
    while np.random.rand() >= p_pass:  # failed: take the exam again
        exams += 1
    return exams
print(number_of_attempts(0.4))

# Repeat the experiment N times with pass probability p and keep the number
# of attempts needed in each run.
N = 10000
p = 0.4
attempts = [number_of_attempts(p) for _ in range(N)]

plt.figure()  # own figure, so nothing is drawn over an earlier plot
# Attempt counts are integers starting at 1: one bin per count, centred on
# the integer, avoids the gaps a fixed number of bins produces.
plt.hist(attempts, bins=np.arange(1, max(attempts) + 2) - 0.5)
plt.xlabel("Number of attempts")  # the data are attempt counts, not scores
plt.ylabel("Freq")
plt.title("Histogram of avg number of attempts to Pass exam")
plt.show()

np.mean(attempts)  # average number of attempts; shown only interactively