Probability for Data Science

import numpy as np from numpy.random import binomial from scipy.stats import binom from math import factorial import matplotlib.pyplot as plt

# definition of the binomial distribution
def my_binomial(k, n, p):
    """Return the binomial probability mass P(X = k) for X ~ Binomial(n, p).

    Args:
        k: Number of successes (integer).
        n: Number of independent trials (non-negative integer).
        p: Probability of success in a single trial.

    Returns:
        The probability of observing exactly ``k`` successes in ``n`` trials.
        Any ``k`` outside ``0..n`` has probability ``0.0``.
    """
    # Outside the support the pmf is zero. Without this guard a negative
    # argument reaches factorial() and raises ValueError.
    if k < 0 or k > n:
        return 0.0
    n_choose_k = factorial(n) / (factorial(k) * factorial(n - k))
    return n_choose_k * pow(p, k) * pow(1 - p, n - k)

# Sanity check: P(X = 2) for three fair coin flips using the hand-written pmf.
hand_computed = my_binomial(2, 3, 0.5)
print("My binomial: {}".format(hand_computed))

# The same probability from scipy's frozen Binomial(n=3, p=0.5) distribution.
dist = binom(3, 0.5)
dist.pmf(2)

# P(X <= 2) = 1 - P(X = 3) = 7/8; compare with scipy's cumulative distribution.
print(7 / 8.0)
dist.cdf(2)

# One simulated experiment: the number of successes in n = 3 tosses of a
# balanced coin (p = 0.5).
# (run this cell multiple times to observe the variation in the results)
n = 3
p = 0.5
binomial(n, p)

# Repeat the 3-toss experiment 100 times and tabulate how many times each
# number of successes was observed.
arr = [binomial(n, p) for _ in range(100)]
simulated_distribution = np.unique(arr, return_counts=True)
simulated_distribution

def plot_hist(num_trials, n=3, p=0.5):
    """Overlay the simulated and theoretical Binomial(n, p) distributions.

    Runs ``num_trials`` simulated experiments of ``n`` tosses each, converts
    the outcomes to relative frequencies and draws them as bars on top of the
    exact probability mass function.

    Args:
        num_trials: Number of simulated experiments; must be positive.
        n: Number of tosses per experiment (defaults to 3).
        p: Probability of success of a single toss (defaults to 0.5).

    Raises:
        ValueError: If ``num_trials`` is not positive.
    """
    if num_trials <= 0:
        raise ValueError("num_trials must be a positive integer")

    values = np.arange(n + 1)
    outcomes = binomial(n, p, size=num_trials)
    # bincount with minlength keeps a zero entry for outcomes that never
    # occurred. np.unique drops them, so the bar heights would no longer line
    # up with `values` (quite likely when num_trials is small).
    simulated_distribution = np.bincount(outcomes, minlength=n + 1) / num_trials
    theoretical_distribution = binom(n, p).pmf(values)

    plt.bar(values, theoretical_distribution, label='theory', color='red')
    plt.bar(values, simulated_distribution, label='simulation', alpha=0.5, color='blue')
    plt.title('Simulation with {} experiments'.format(num_trials))
    plt.legend()  # the bar labels are only displayed through a legend
    plt.show()


plot_hist(20)
plot_hist(200)
plot_hist(20000)

import pandas as pd import numpy as np import matplotlib.pyplot as plt from scipy.stats import norm

# Defining our gaussian distribution
def gaussian(x, mu, sigma):
    """Evaluate the normal probability density with mean mu and std sigma at x.

    Args:
        x: Point(s) at which to evaluate the density; a scalar or NumPy array.
        mu: Mean of the distribution.
        sigma: Standard deviation; must be strictly positive.

    Returns:
        The density value(s) at ``x``.

    Raises:
        ValueError: If ``sigma`` is not strictly positive.
    """
    # sigma == 0 divides by zero and sigma < 0 would produce negative
    # "densities", so reject both instead of returning a meaningless value.
    if np.any(np.asarray(sigma) <= 0):
        raise ValueError("sigma must be strictly positive")
    z = (x - mu) / sigma
    return 1 / (sigma * np.sqrt(2 * np.pi)) * np.exp(-0.5 * pow(z, 2))

# Plot the hand-written density for mu = 0, sigma = 1, evaluated on a grid
# that starts at -4 and advances toward 4 in steps of 0.1.
mu, sigma = 0.0, 1.0
x = np.arange(-4, 4, 0.1)
y = gaussian(x, mu, sigma)
plt.plot(x, y)

# using scipy: density of the normal distribution with mean 0 and std 1
dist = norm(0, 1)
x = np.arange(-4, 4, 0.1)
# pdf accepts arrays, so the whole grid is evaluated in one vectorized call
# instead of one Python-level call per point.
y = dist.pdf(x)
plt.plot(x, y)

# Calculating the cumulative distribution on the same grid
dist = norm(0, 1)
x = np.arange(-4, 4, 0.1)
y = dist.cdf(x)
plt.plot(x, y)

# Load the spreadsheet and pull out the measurements column, skipping its
# first three entries (presumably non-data header rows - verify against the file).
df = pd.read_excel('/work/s057.xls')
wing_length_column = df['Normally Distributed Housefly Wing Lengths']
arr = wing_length_column.values[3:]

# Count how many times each distinct value occurs, print the distinct values
# and draw the counts as a bar chart.
values, dist = np.unique(arr, return_counts=True)
print(values)
plt.bar(values, dist)

# estimation of the probability distribution: use the sample mean and
# standard deviation of the data as the parameters of a normal distribution
mu = arr.mean()
sigma = arr.std()

# theoretical distribution
dist = norm(mu, sigma)
x = np.arange(33, 58, 0.1)
# pdf accepts arrays: evaluate the whole grid in one vectorized call
y = dist.pdf(x)
plt.plot(x, y)

# data: relative frequency of each distinct observed value
values, dist = np.unique(arr, return_counts=True)
plt.bar(values, dist / len(arr))

import numpy as np from matplotlib import pyplot from numpy.random import normal from scipy.stats import norm

# Draw 10,000 values from the random generator based on the normal
# distribution (mean 0, standard deviation 1) and show a 30-bin histogram.
sample = normal(loc=0.0, scale=1.0, size=10_000)
pyplot.hist(sample, bins=30)
pyplot.show()

# Parametric density estimation: draw a sample, estimate mean and standard
# deviation from it, and compare the fitted normal density with the
# normalized histogram of the sample.
sample = normal(loc=50, scale=5, size=10_000)  # mu = 50, sigma = 5
mu = sample.mean()
sigma = sample.std()
dist = norm(mu, sigma)

# Integer grid 30..69; pdf is evaluated on the whole array in one call
# instead of once per value in a Python loop.
values = np.arange(30, 70)
probabilities = dist.pdf(values)

pyplot.hist(sample, bins=30, density=True)
pyplot.plot(values, probabilities)
pyplot.show()

from numpy import hstack
from sklearn.neighbors import KernelDensity

# we build a bimodal distribution by joining two normal samples
sample1 = normal(loc=20, scale=5, size=3_000)  # mu = 20, sigma = 5
sample2 = normal(loc=40, scale=5, size=7_000)  # mu = 40, sigma = 5
sample = hstack((sample1, sample2))

# Kernel density model; it is fitted on a 2-D column of observations.
model = KernelDensity(bandwidth=2, kernel='gaussian')  # bandwidth = smoothing parameter
sample = sample.reshape(-1, 1)
model.fit(sample)

# Evaluate the fitted model on the integers 1..59, also as a column.
values = np.arange(1, 60).reshape(-1, 1)
log_density = model.score_samples(values)  # logarithmic probability
probabilities = np.exp(log_density)  # undo the logarithm

pyplot.hist(sample, bins=50, density=True)
pyplot.plot(values, probabilities)
pyplot.show()

from mpl_toolkits.mplot3d import Axes3D import matplotlib.pyplot as plt from matplotlib import cm import numpy as np import pandas as pd import seaborn as sns

def likelihood(y, yp):
    """Return yp * y + (1 - yp) * (1 - y).

    For y = 1 the result is yp and for y = 0 it is 1 - yp. The expression
    uses only arithmetic operators, so it also works elementwise on NumPy
    arrays.

    Args:
        y: Label value(s).
        yp: Predicted probability value(s).

    Returns:
        The likelihood value(s) for the given pair(s).
    """
    return yp * y + (1 - yp) * (1 - y)


# Plot the likelihood as a surface over a grid of (y, yp) pairs.
fig = plt.figure()
# Figure.gca(projection=...) was deprecated in Matplotlib 3.4 and later
# removed; add_subplot is the supported way to create 3D axes.
ax = fig.add_subplot(111, projection='3d')
Y = np.arange(0, 1, 0.01)
YP = np.arange(0, 1, 0.01)
Y, YP = np.meshgrid(Y, YP)
Z = likelihood(Y, YP)
surf = ax.plot_surface(Y, YP, Z, cmap=cm.coolwarm, linewidth=0, antialiased=False)
fig.colorbar(surf, shrink=0.5, aspect=5)
plt.show()

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

# Feature matrix and label vector of the iris dataset.
X, y = load_iris(return_X_y=True)
# Names used to label the four feature columns.
atrib_names = ['sepal length', 'sepal width', 'petal length', 'petal width']

# Peek at the first two feature rows.
X[:2]

# Labels of the first 100 samples.
y[:100]

# Fit a logistic regression using only the first 100 samples and their labels.
clf = LogisticRegression(random_state=10, solver='liblinear')
clf.fit(X[:100], y[:100])

# Fitted coefficients of the model.
clf.coef_

# The same coefficients as a table whose columns carry the feature names.
model_coefs = pd.DataFrame(data=clf.coef_, columns=atrib_names)
model_coefs