import numpy as np
from numpy.random import binomial
from scipy.stats import binom
from math import factorial
import matplotlib.pyplot as plt
#definition of the binomial distribution
def my_binomial(k, n, p):
    """Return the binomial probability mass P(X = k) for X ~ Binomial(n, p).

    Args:
        k: Number of successes (integer).
        n: Number of independent trials (non-negative integer).
        p: Probability of success in a single trial.

    Returns:
        The probability of exactly ``k`` successes, as a float. Any ``k``
        outside ``0..n`` lies outside the support and yields ``0.0``.
    """
    from math import exp, log

    # Outside the support the pmf is zero; returning early also keeps
    # factorial() from being called with a negative argument.
    if k < 0 or k > n:
        return 0.0

    # Floor division keeps the binomial coefficient an exact integer. True
    # division of the factorials raises OverflowError as soon as the
    # quotient exceeds the float range (e.g. n=2000, k=1000).
    coefficient = factorial(n) // (factorial(k) * factorial(n - k))
    try:
        return float(coefficient) * pow(p, k) * pow(1 - p, n - k)
    except OverflowError:
        # The coefficient alone does not fit in a float, which can only
        # happen for 0 < k < n. A degenerate p then gives probability zero.
        if p == 0 or p == 1:
            return 0.0
        # Combine the factors in log space, where math.log accepts
        # arbitrarily large integers.
        return exp(log(coefficient) + k * log(p) + (n - k) * log(1 - p))
# Cross-check the hand-written pmf against scipy for 3 fair coin flips.
print("My binomial: {}".format(my_binomial(2, 3, 0.5)))

# Frozen Binomial(n=3, p=0.5) distribution from scipy.
dist = binom(n=3, p=0.5)
dist.pmf(2)  # P(X = 2); the result is neither stored nor printed here

# 7/8 is the hand-computed value of P(X <= 2) = 1 - P(X = 3).
print(7 / 8.0)
dist.cdf(2)  # P(X <= 2); likewise evaluated and discarded
# One experiment: the number of heads in n = 3 flips of a fair coin.
# (Re-running this draws a new random outcome each time.)
p = 0.5
n = 3
binomial(n, p)

# Repeat the 3-flip experiment 100 times and tally how often each head
# count occurred. np.unique returns (distinct outcomes, their counts).
arr = [binomial(n, p) for _ in range(100)]
simulated_distribution = np.unique(arr, return_counts=True)
simulated_distribution
def plot_hist(num_trials, n=3, p=0.5):
    """Plot simulated vs. theoretical Binomial(n, p) frequencies.

    Draws ``num_trials`` samples from Binomial(n, p), converts them to
    relative frequencies and overlays them on the exact pmf as bar charts.

    Args:
        num_trials: Number of simulated experiments; must be positive.
        n: Number of coin flips per experiment (default 3).
        p: Probability of heads for each flip (default 0.5).

    Raises:
        ValueError: If ``num_trials`` is not positive.
    """
    if num_trials <= 0:
        raise ValueError("num_trials must be a positive integer")

    values = np.arange(n + 1)
    draws = binomial(n, p, size=num_trials)
    # bincount with minlength gives one frequency per possible outcome,
    # including outcomes that never occurred. np.unique only reports the
    # outcomes that were seen, so small samples could produce fewer bars
    # than `values` and make plt.bar fail on the length mismatch.
    simulated_distribution = np.bincount(draws, minlength=len(values)) / num_trials
    theorical_distribution = binom(n, p).pmf(values)

    plt.bar(values, theorical_distribution, label='theory', color='red')
    plt.bar(values, simulated_distribution, label='simulation', alpha=0.5, color='blue')
    plt.title('Simulation with {} experiments'.format(num_trials))
    # The bar labels are only rendered once a legend is requested.
    plt.legend()
    plt.show()
# Draw the comparison for three increasingly large numbers of experiments.
for _num_trials in (20, 200, 20000):
    plot_hist(_num_trials)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
# Defining our gaussian distribution
def gaussian(x, mu, sigma):
    """Evaluate the normal probability density with mean mu and std sigma.

    Args:
        x: Point(s) at which to evaluate the density: a scalar, a Python
            sequence or a NumPy array.
        mu: Mean of the distribution.
        sigma: Standard deviation of the distribution; must be positive.

    Returns:
        The density at ``x``: a NumPy float for scalar input, otherwise an
        array with the same shape as ``x``.

    Raises:
        ValueError: If ``sigma`` is not strictly positive.
    """
    # A non-positive sigma would otherwise divide by zero (or flip the sign
    # of the density) and return inf/nan/negative values without an error.
    if np.any(np.asarray(sigma) <= 0):
        raise ValueError("sigma must be strictly positive")

    # asarray lets plain lists and tuples be passed in, not only arrays.
    z = (np.asarray(x, dtype=float) - mu) / sigma
    return 1 / (sigma * np.sqrt(2 * np.pi)) * np.exp(-0.5 * z ** 2)
# Plot the hand-written standard normal density on [-4, 4) in steps of 0.1.
x = np.arange(-4, 4, 0.1)
y = gaussian(x, 0.0, 1.0)
plt.plot(x, y)
# Show the figure now. Without this, the curve stays on the current figure
# and is drawn together with whatever is plotted next.
plt.show()
# Same density, this time from scipy's frozen standard normal.
dist = norm(0, 1)
x = np.arange(-4, 4, 0.1)
# pdf accepts an array, so the whole grid is evaluated in one call.
y = dist.pdf(x)
plt.plot(x, y)
# Show the figure now so this curve is not merged into later plots.
plt.show()
# Cumulative distribution function of the standard normal.
dist = norm(0, 1)
x = np.arange(-4, 4, 0.1)
# cdf accepts an array, so the whole grid is evaluated in one call.
y = dist.cdf(x)
plt.plot(x, y)
# Show the figure now so the CDF is not drawn on top of later plots.
plt.show()
# Load the housefly wing-length spreadsheet.
df = pd.read_excel('/work/s057.xls')
# The first three entries of the column are skipped.
# NOTE(review): presumably these are non-data header rows - confirm
# against the spreadsheet.
arr = df['Normally Distributed Housefly Wing Lengths'].values[3:]

# Distinct wing lengths and how many times each one occurs. The counts get
# their own name instead of reusing `dist`, which elsewhere in this script
# holds a scipy distribution.
values, counts = np.unique(arr, return_counts=True)
print(values)
plt.bar(values, counts)
# Show the raw-count bar chart as its own figure.
plt.show()
# Estimate the distribution parameters from the data.
mu = arr.mean()
sigma = arr.std()

# Theoretical distribution: normal density with the estimated parameters,
# evaluated on the whole grid in a single vectorized call.
dist = norm(mu, sigma)
x = np.arange(33, 58, 0.1)
y = dist.pdf(x)
plt.plot(x, y)

# Data: relative frequency of each distinct value, drawn on the same axes
# as the fitted curve. The counts get their own name so that `dist` keeps
# referring to the fitted distribution.
values, counts = np.unique(arr, return_counts=True)
plt.bar(values, counts / len(arr))
# Show the fitted curve and the data together as one figure.
plt.show()
import numpy as np
from matplotlib import pyplot
from numpy.random import normal
from scipy.stats import norm
# Draw 10,000 values from the standard normal (mean 0, standard deviation 1)
# and show them as a 30-bin histogram.
sample = normal(loc=0.0, scale=1.0, size=10_000)
pyplot.hist(sample, bins=30)
pyplot.show()
# Draw 10,000 values with mu = 50 and sigma = 5, then fit a normal
# distribution using the sample mean and standard deviation.
sample = normal(loc=50, scale=5, size=10_000)
mu, sigma = sample.mean(), sample.std()
dist = norm(mu, sigma)

# Evaluate the fitted density at the integers 30..69.
values = list(range(30, 70))
probabilities = list(map(dist.pdf, values))

# density=True normalizes the histogram so it is on the same scale as the
# fitted density curve drawn over it.
pyplot.hist(sample, bins=30, density=True)
pyplot.plot(values, probabilities)
pyplot.show()
from numpy import hstack
from sklearn.neighbors import KernelDensity
# Build a bimodal sample: 3,000 draws around 20 and 7,000 draws around 40,
# both with sigma = 5.
sample1 = normal(loc=20, scale=5, size=3_000)
sample2 = normal(loc=40, scale=5, size=7_000)
# Stack both parts and reshape to a single-column 2-D array before fitting.
sample = hstack((sample1, sample2)).reshape(-1, 1)

# Gaussian kernel density estimator; bandwidth is the smoothing parameter.
model = KernelDensity(bandwidth=2, kernel='gaussian')
model.fit(sample)

# Evaluate the estimate at the integers 1..59, again as a column vector.
values = np.arange(1, 60).reshape(-1, 1)
# score_samples yields logarithmic values, so exponentiate to undo the log.
probabilities = np.exp(model.score_samples(values))

pyplot.hist(sample, bins=50, density=True)
pyplot.plot(values, probabilities)
pyplot.show()
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np
import pandas as pd
import seaborn as sns
def likelihood(y, yp):
    """Return the likelihood of label ``y`` given predicted probability ``yp``.

    Computes ``yp * y + (1 - yp) * (1 - y)``: for ``y == 1`` this is ``yp``
    and for ``y == 0`` it is ``1 - yp``. Only arithmetic operators are used,
    so scalars and NumPy arrays are handled element-wise.

    Args:
        y: Label value(s).
        yp: Predicted probability value(s) that the label is 1.

    Returns:
        The likelihood value(s), matching the type/shape of the inputs.
    """
    return yp * y + (1 - yp) * (1 - y)
# 3-D surface of likelihood(y, yp) over a 0.01-step grid on [0, 1) x [0, 1).
fig = plt.figure()
# Create the 3-D axes with add_subplot: passing `projection` to Figure.gca()
# was deprecated in Matplotlib 3.4 and is rejected by later releases.
ax = fig.add_subplot(111, projection='3d')

Y = np.arange(0, 1, 0.01)
YP = np.arange(0, 1, 0.01)
Y, YP = np.meshgrid(Y, YP)
Z = likelihood(Y, YP)

surf = ax.plot_surface(Y, YP, Z, cmap=cm.coolwarm, linewidth=0, antialiased=False)
fig.colorbar(surf, shrink=0.5, aspect=5)
plt.show()
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
# Column labels used for the coefficient table at the end of this block.
atrib_names = ['sepal length', 'sepal width', 'petal length', 'petal width']

# Feature matrix and target vector of the iris dataset.
X, y = load_iris(return_X_y=True)
X[:2]    # first two feature rows (evaluated, not stored or printed)
y[:100]  # first 100 targets (evaluated, not stored or printed)

# Fit on the first 100 samples only.
# NOTE(review): presumably these rows contain just two classes, making this
# a binary problem - confirm against the dataset ordering.
clf = LogisticRegression(random_state=10, solver='liblinear')
clf.fit(X[:100], y[:100])
clf.coef_

# Learned coefficients as a table with one column per attribute.
model_coefs = pd.DataFrame(data=clf.coef_, columns=atrib_names)
model_coefs