import numpy as np
from numpy.random import binomial
from scipy.stats import binom
from math import factorial
import matplotlib.pyplot as plt
#definition of the binomial distribution
def my_binomial(k, n, p):
    """Return the binomial probability P(X = k) for X ~ Binomial(n, p).

    Args:
        k: Number of successes (integer).
        n: Number of independent trials (integer).
        p: Probability of success in a single trial.

    Returns:
        The probability of observing exactly ``k`` successes in ``n``
        trials, or ``0.0`` when ``k`` lies outside ``0..n``.
    """
    # Outside the support the probability is zero. Without this guard
    # factorial() raises ValueError on the negative argument.
    if k < 0 or k > n:
        return 0.0
    coefficient = factorial(n) / (factorial(k) * factorial(n - k))
    return coefficient * pow(p, k) * pow(1 - p, n - k)
# Sanity check: P(X = 2) for three fair coin tosses.
print("My binomial: {}".format(my_binomial(2, 3, 0.5)))
# Output: My binomial: 0.375

# The same distribution as a frozen scipy object, for comparison.
dist = binom(3, 0.5)
dist.pmf(2)
# 7/8 printed next to the CDF at 2 so the two values can be compared.
print(7 / 8.0)
dist.cdf(2)
# Output: 0.875
# Simulate tossing n = 3 fair coins; one draw is the number of successes.
# (Re-run to see how the random result changes from one run to the next.)
p = 0.5
n = 3
binomial(n, p)
# Repeat the 3-coin experiment 100 times, then tabulate how many times
# each outcome was observed.
arr = [binomial(n, p) for _ in range(100)]
simulated_distribution = np.unique(arr, return_counts=True)
simulated_distribution
def plot_hist(num_trials):
    """Plot theoretical vs. simulated frequencies for Binomial(3, 0.5).

    Draws ``num_trials`` samples from Binomial(3, 0.5), converts them to
    relative frequencies and overlays them on the exact probabilities.

    Args:
        num_trials: Number of simulated experiments; must be positive.

    Raises:
        ValueError: If ``num_trials`` is not positive.
    """
    if num_trials <= 0:
        raise ValueError("num_trials must be a positive integer")
    values = [0, 1, 2, 3]
    samples = binomial(3, 0.5, size=num_trials)
    # bincount with minlength yields one frequency per possible outcome,
    # even for outcomes that never occurred. np.unique would silently drop
    # them and plt.bar would then fail on mismatched lengths.
    simulated_distribution = np.bincount(samples, minlength=len(values)) / num_trials
    theoretical_distribution = binom(3, 0.5).pmf(values)
    plt.bar(values, theoretical_distribution, label='theory', color='red')
    plt.bar(values, simulated_distribution, label='simulation', alpha=0.5, color='blue')
    plt.title('Simulation with {} experiments'.format(num_trials))
    # The label= arguments above are only rendered once a legend is drawn.
    plt.legend()
    plt.show()
# Draw the comparison for increasingly large numbers of experiments.
for num_trials in (20, 200, 20000):
    plot_hist(num_trials)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
# Defining our gaussian distribution
def gaussian(x, mu, sigma):
    """Evaluate the normal probability density with mean mu and std sigma.

    Args:
        x: Point(s) at which to evaluate the density (scalar or NumPy array).
        mu: Mean of the distribution.
        sigma: Standard deviation of the distribution.

    Returns:
        The density value(s) at ``x``.
    """
    standardized = (x - mu) / sigma
    normalization = 1 / (sigma * np.sqrt(2 * np.pi))
    return normalization * np.exp(-0.5 * standardized ** 2)
# Density of the standard normal from the hand-written formula.
x = np.arange(-4, 4, 0.1)
y = gaussian(x, 0.0, 1.0)
plt.plot(x, y)
# Finish each figure before starting the next one. Otherwise, when run as a
# script, every curve lands on the same axes and nothing is displayed.
plt.show()

# using scipy
dist = norm(0, 1)
y = dist.pdf(x)  # pdf accepts the whole array; no per-point loop needed
plt.plot(x, y)
plt.show()

# Calculating the cumulative distribution
y = dist.cdf(x)
plt.plot(x, y)
plt.show()
# Load the wing-length spreadsheet (the path is specific to the environment).
df = pd.read_excel('/work/s057.xls')
# NOTE(review): the first three entries of the column are skipped, presumably
# header/metadata rows of the sheet. Confirm against the file.
arr = df['Normally Distributed Housefly Wing Lengths'].values[3:]
values, dist = np.unique(arr, return_counts=True)
print(values)
# Output: [36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55]
plt.bar(values, dist)
# Close out the raw-count chart so it does not share axes with the density
# comparison below, whose values are on a much smaller scale.
plt.show()

# estimation of probability distribution
mu = arr.mean()
sigma = arr.std()
# theoretical distribution: a normal with the sample mean and std
dist = norm(mu, sigma)
x = np.arange(33, 58, 0.1)
y = dist.pdf(x)
plt.plot(x, y)
# data: relative frequency of each observed wing length
values, dist = np.unique(arr, return_counts=True)
plt.bar(values, dist / len(arr))
plt.show()
import numpy as np
from matplotlib import pyplot
from numpy.random import normal
from scipy.stats import norm
# Draw 10,000 values from the default normal generator and look at
# their histogram.
sample = normal(size=10_000)
pyplot.hist(sample, bins=30)
pyplot.show()

# Draw from a normal with mu = 50, sigma = 5, then fit a normal distribution
# using the sample mean and standard deviation.
sample = normal(loc=50, scale=5, size=10_000)
mu, sigma = sample.mean(), sample.std()
dist = norm(mu, sigma)
# Evaluate the fitted density on the integers 30..69 and overlay it on the
# density-normalized histogram.
values = list(range(30, 70))
probabilities = list(map(dist.pdf, values))
pyplot.hist(sample, bins=30, density=True)
pyplot.plot(values, probabilities)
pyplot.show()
from numpy import hstack
from sklearn.neighbors import KernelDensity
# Build a bimodal sample by stacking draws from two normal distributions.
sample1 = normal(loc=20, scale=5, size=3_000)  # mu = 20, sigma = 5
sample2 = normal(loc=40, scale=5, size=7_000)  # mu = 40, sigma = 5
# Column-vector layout: one row per observation.
sample = hstack((sample1, sample2)).reshape(-1, 1)

# bandwidth is the smoothing parameter of the kernel density estimate
model = KernelDensity(bandwidth=2, kernel='gaussian')
model.fit(sample)

# Evaluate the fitted model on the integers 1..59; score_samples returns
# log values, so exponentiate to get back to the density scale.
values = np.arange(1, 60).reshape(-1, 1)
probabilities = np.exp(model.score_samples(values))

pyplot.hist(sample, bins=50, density=True)
pyplot.plot(values, probabilities)
pyplot.show()
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np
import pandas as pd
import seaborn as sns
def likelihood(y, yp):
    """Return yp * y + (1 - yp) * (1 - y).

    Args:
        y: First value (scalar or NumPy array).
        yp: Second value (scalar or NumPy array).

    Returns:
        The sum of the product of the two inputs and the product of
        their complements.
    """
    both = yp * y
    neither = (1 - yp) * (1 - y)
    return both + neither
# Surface plot of likelihood(y, yp) over the grid [0, 1) x [0, 1).
fig = plt.figure()
# fig.gca(projection='3d') raised a MatplotlibDeprecationWarning here
# (deprecated in Matplotlib 3.4). Create the 3D axes explicitly instead.
ax = fig.add_subplot(111, projection='3d')
Y = np.arange(0, 1, 0.01)
YP = np.arange(0, 1, 0.01)
Y, YP = np.meshgrid(Y, YP)
Z = likelihood(Y, YP)
surf = ax.plot_surface(Y, YP, Z, cmap=cm.coolwarm, linewidth=0, antialiased=False)
fig.colorbar(surf, shrink=0.5, aspect=5)
plt.show()
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
# Column labels used when tabulating the fitted coefficients.
atrib_names = ['sepal length', 'sepal width', 'petal length', 'petal width']
X, y = load_iris(return_X_y=True)
X[:2]
y[:100]
# Fit on the first 100 rows only.
# NOTE(review): presumably these rows cover just the first two classes,
# making this a binary problem. Confirm against the dataset ordering.
clf = LogisticRegression(random_state=10, solver='liblinear')
clf.fit(X[:100], y[:100])
clf.coef_
# One column per attribute, holding the learned coefficients.
model_coefs = pd.DataFrame(clf.coef_, columns=atrib_names)
model_coefs