%%writefile distributions/general.py
class Distribution:
    """Generic distribution class for calculating and visualizing a
    probability distribution.

    Attributes:
        mean (float): mean value of the distribution
        stdev (float): standard deviation of the distribution
        data (list of numbers): values loaded by read_data_file
            (empty until a file has been read)
    """

    def __init__(self, mu=0, sigma=1):
        """Store the distribution parameters and start with no data.

        Args:
            mu (float): mean value of the distribution
            sigma (float): standard deviation of the distribution
        """
        self.mean = mu
        self.stdev = sigma
        self.data = []

    def read_data_file(self, file_name):
        """Read numbers from a text file into the data attribute.

        The file should hold one number per line. Whole numbers are stored
        as int (so existing integer data files load exactly as before);
        anything else is parsed as float. Blank lines are skipped.

        Args:
            file_name (string): name of a file to read from

        Returns:
            None

        Raises:
            ValueError: if a non-blank line is not a valid number
        """
        data_list = []
        # The context manager closes the file; no explicit close() is needed.
        with open(file_name) as file:
            for line in file:
                text = line.strip()
                if not text:
                    # Tolerate empty lines, e.g. a trailing newline at EOF.
                    continue
                try:
                    value = int(text)
                except ValueError:
                    # Not an integer literal: fall back to float so files
                    # with decimal values can be loaded as documented.
                    value = float(text)
                data_list.append(value)
        self.data = data_list
# Notebook check of the base class written to distributions/general.py.
from distributions.general import Distribution # import the class from the package directory
general1 = Distribution(20, 3) # object with mean 20 and standard deviation 3
print("Mean: {}, standard deviation: {}".format(general1.mean, general1.stdev))
general1.read_data_file("data/random.txt") # load the text file into general1.data
print("Data Loaded :", general1.data)
# Expected outcome: the parameters and the loaded data are printed without errors.
%%writefile distributions/gaussian.py
import math
import matplotlib.pyplot as plt
from .general import Distribution
class Gaussian(Distribution):
    """Gaussian distribution class for calculating and visualizing a
    Gaussian distribution.

    Attributes:
        mean (float): mean value of the distribution
        stdev (float): standard deviation of the distribution
        data (list of numbers): values loaded from a data file
    """

    def __init__(self, mu=0, sigma=1):
        """Create a Gaussian with the given mean and standard deviation.

        Args:
            mu (float): mean value of the distribution
            sigma (float): standard deviation of the distribution
        """
        super().__init__(mu, sigma)

    def calculate_mean(self):
        """Calculate the mean of the data set and store it in self.mean.

        Args:
            None

        Returns:
            float: mean of the data set

        Raises:
            ZeroDivisionError: if no data has been loaded
        """
        self.mean = 1.0 * sum(self.data) / len(self.data)
        return self.mean

    def calculate_stdev(self, sample=True):
        """Calculate the standard deviation of the data set and store it in
        self.stdev. Also refreshes self.mean from the data.

        Args:
            sample (bool): whether the data represents a sample (divide by
                n - 1) or a population (divide by n)

        Returns:
            float: standard deviation of the data set

        Raises:
            ZeroDivisionError: if the divisor is zero (no data, or a single
                data point with sample=True)
        """
        n = len(self.data) - 1 if sample else len(self.data)
        mean = self.calculate_mean()
        squared_error = sum((d - mean) ** 2 for d in self.data)
        self.stdev = math.sqrt(squared_error / n)
        return self.stdev

    def plot_histogram(self):
        """Draw a histogram of the instance variable data using the
        matplotlib pyplot library.

        Args:
            None

        Returns:
            None
        """
        plt.hist(self.data)
        plt.title('Histogram of Data')
        plt.xlabel('data')
        plt.ylabel('count')

    def pdf(self, x):
        """Probability density function calculator for the gaussian
        distribution, using the current self.mean and self.stdev.

        Args:
            x (float): point for calculating the probability density function

        Returns:
            float: probability density function output
        """
        coefficient = 1.0 / (self.stdev * math.sqrt(2 * math.pi))
        exponent = -0.5 * ((x - self.mean) / self.stdev) ** 2
        return coefficient * math.exp(exponent)

    def plot_histogram_pdf(self, n_spaces=50):
        """Plot the normalized histogram of the data and a plot of the
        probability density function along the same range.

        Args:
            n_spaces (int): number of data points

        Returns:
            list: x values for the pdf plot
            list: y values for the pdf plot
        """
        min_range = min(self.data)
        max_range = max(self.data)

        # interval between consecutive x values
        interval = 1.0 * (max_range - min_range) / n_spaces

        # x values start at the data minimum and advance one interval at a time
        x = [min_range + interval * i for i in range(n_spaces)]
        y = [self.pdf(value) for value in x]

        # make the plots: histogram on top, pdf curve underneath
        fig, axes = plt.subplots(2, sharex=True)
        fig.subplots_adjust(hspace=.5)
        axes[0].hist(self.data, density=True)
        axes[0].set_title('Normed Histogram of Data')
        axes[0].set_ylabel('Density')

        axes[1].plot(x, y)
        axes[1].set_title('Normal Distribution for \n Sample Mean and Sample Standard Deviation')
        # Label the pdf subplot itself (previously axes[0] was labelled twice
        # and this subplot was left without a y label).
        axes[1].set_ylabel('Density')
        plt.show()

        return x, y

    def __add__(self, other):
        """Add together two Gaussian distributions.

        The means add, and the standard deviations combine as the square
        root of the sum of their squares.

        Args:
            other (Gaussian): Gaussian instance

        Returns:
            Gaussian: Gaussian distribution
        """
        result = Gaussian()
        result.mean = self.mean + other.mean
        result.stdev = math.sqrt(self.stdev ** 2 + other.stdev ** 2)
        return result

    def __repr__(self):
        """Output the characteristics of the Gaussian instance.

        Args:
            None

        Returns:
            string: characteristics of the Gaussian
        """
        return "mean {}, standard deviation {}".format(self.mean, self.stdev)
# Notebook check of the Gaussian class written to distributions/gaussian.py.
from distributions.gaussian import Gaussian
Gaussian(20,3) # bare expression: exercises the __repr__ magic method
gaussian1 = Gaussian(25, 2)
gaussian1.read_data_file('data/random.txt')
print("Before pdf : ", round(gaussian1.pdf(25), 5)) # pdf using the initial mean and stdev
print("mean :", round(gaussian1.calculate_mean(), 2)) # mean calculated from the data
print("Stdev :", round(gaussian1.calculate_stdev(), 2)) # sample stdev calculated from the data
print("After pdf : ", round(gaussian1.pdf(75), 5)) # pdf using the mean and stdev calculated from the data
gaussian1.plot_histogram_pdf();
%%writefile test/test_gaussian.py
import unittest
from distributions.gaussian import Gaussian
class TestGaussianClass(unittest.TestCase):
    """Unit tests for the Gaussian distribution class."""

    def setUp(self):
        """Create a Gaussian(35, 6) fixture loaded with data/random.txt."""
        fixture = Gaussian(35, 6)
        fixture.read_data_file('data/random.txt')
        self.gaussian = fixture

    def test_initialization(self):
        """Constructor arguments are stored as mean and stdev."""
        g = self.gaussian
        self.assertEqual(g.mean, 35, 'incorrect mean')
        self.assertEqual(g.stdev, 6, 'incorrect standard deviation')

    def test_readdata(self):
        """The data file is read into the data attribute in order."""
        expected = [
            10, 23, 45, 12, 23, 45, 67, 100, 300, 250, 45,
            68, 29, 59, 239, 934, 12, 321, 12, 32, 1,
        ]
        self.assertEqual(self.gaussian.data, expected, 'data not read in correctly')

    def test_meancalculation(self):
        """calculate_mean returns the arithmetic mean of the data."""
        values = self.gaussian.data
        expected = sum(values) / float(len(values))
        self.assertEqual(self.gaussian.calculate_mean(), expected,
                         'calculated mean not as expected')

    def test_stdevcalculation(self):
        """Sample and population standard deviations match known values."""
        sample_stdev = round(self.gaussian.calculate_stdev(), 2)
        self.assertEqual(sample_stdev, 210.77, 'sample standard deviation incorrect')
        population_stdev = round(self.gaussian.calculate_stdev(0), 2)
        self.assertEqual(population_stdev, 205.69, 'population standard deviation incorrect')

    def test_pdf(self):
        """pdf uses the constructor parameters first, then the data statistics."""
        before = round(self.gaussian.pdf(25), 5)
        self.assertEqual(before, 0.01658,
                         'pdf function does not give expected result')

        # Recompute mean and stdev from the loaded data before the second check.
        self.gaussian.calculate_mean()
        self.gaussian.calculate_stdev()
        after = round(self.gaussian.pdf(75), 5)
        self.assertEqual(after, 0.00184,
                         'pdf function after calculating mean and stdev does not give expected result')

    def test_add(self):
        """Adding two Gaussians adds the means and combines the stdevs."""
        gaussian_sum = Gaussian(25, 3) + Gaussian(30, 4)
        self.assertEqual(gaussian_sum.mean, 55)
        self.assertEqual(gaussian_sum.stdev, 5)
# Run the test suite when this module is executed directly as a script.
if __name__ == '__main__':
unittest.main()
# Notebook shell escape: execute the Gaussian test module written above.
!python test/test_gaussian.py
%%writefile distributions/binomial.py
import math
import matplotlib.pyplot as plt
from .general import Distribution
class Binomial(Distribution):
    """Binomial distribution class for calculating and visualizing a
    Binomial distribution.

    Attributes:
        mean (float): mean value of the distribution
        stdev (float): standard deviation of the distribution
        data (list of numbers): values loaded from a data file
        p (float): probability of an event occurring
        n (int): the total number of trials
    """

    def __init__(self, prob=.5, size=20):
        """Create a Binomial distribution from p and n.

        Args:
            prob (float): probability of an event occurring
            size (int): the total number of trials
        """
        self.p = prob
        self.n = size
        # mean and stdev are derived from p and n, then handed to the base
        # initializer, which also creates the empty data list.
        mu = self.calculate_mean()
        sigma = self.calculate_stdev()
        Distribution.__init__(self, mu, sigma)

    def calculate_mean(self):
        """Calculate the mean from p and n and store it in self.mean.

        Args:
            None

        Returns:
            float: mean of the distribution (p * n)
        """
        self.mean = self.p * self.n
        return self.mean

    def calculate_stdev(self):
        """Calculate the standard deviation from p and n and store it in
        self.stdev.

        Args:
            None

        Returns:
            float: standard deviation of the distribution, sqrt(n * p * (1 - p))
        """
        self.stdev = math.sqrt(self.n * self.p * (1 - self.p))
        return self.stdev

    def replace_stats_with_data(self):
        """Calculate p and n from the data set, then refresh mean and stdev.

        n becomes the number of data points and p the sum of the data
        divided by that count.

        Args:
            None

        Returns:
            float: the p value
            int: the n value

        Raises:
            ZeroDivisionError: if no data has been loaded
        """
        self.n = len(self.data)
        self.p = sum(self.data) / len(self.data)
        # calculate_* store their results on the instance themselves.
        self.calculate_mean()
        self.calculate_stdev()
        return self.p, self.n

    def plot_bar(self):
        """Draw a bar chart of the expected counts of outcomes 0 and 1,
        computed from p and n, using the matplotlib pyplot library.

        Args:
            None

        Returns:
            None
        """
        plt.bar(x=['0', '1'], height=[(1 - self.p) * self.n, self.p * self.n])
        plt.title('Bar Chart of Data')
        plt.xlabel('outcome')
        plt.ylabel('count')

    def pdf(self, k):
        """Probability calculator for the binomial distribution: the
        probability of exactly k positive outcomes in n trials.

        Args:
            k (int): number of positive outcomes

        Returns:
            float: probability of k positive outcomes
        """
        # number of ways to choose k successes out of n trials
        a = math.factorial(self.n) / (math.factorial(k) * (math.factorial(self.n - k)))
        # probability of any one such arrangement
        b = (self.p ** k) * (1 - self.p) ** (self.n - k)
        return a * b

    def plot_bar_pdf(self):
        """Plot the probability of every outcome 0..n of the binomial
        distribution as a bar chart.

        Args:
            None

        Returns:
            list: x values for the pdf plot
            list: y values for the pdf plot
        """
        # one bar per possible number of successes, 0 through n inclusive
        x = list(range(self.n + 1))
        y = [self.pdf(k) for k in x]

        # make the plots
        plt.bar(x, y)
        plt.title('Distribution of Outcomes')
        plt.ylabel('Probability')
        plt.xlabel('Outcome')
        plt.show()

        return x, y

    def __add__(self, other):
        """Add together two Binomial distributions with equal p.

        Args:
            other (Binomial): Binomial instance

        Returns:
            Binomial: Binomial distribution with the shared p and the
                summed n; its mean and stdev are computed from those values

        Raises:
            AssertionError: if the two p values differ
        """
        # Raised explicitly (rather than via `assert`) so the check still
        # runs under `python -O`; the exception type is unchanged.
        if self.p != other.p:
            raise AssertionError('p values are not equal')

        # Building the result through the constructor derives mean and stdev
        # from the combined n, not from the left operand alone.
        return Binomial(self.p, self.n + other.n)

    def __repr__(self):
        """Output the characteristics of the Binomial instance.

        Args:
            None

        Returns:
            string: characteristics of the Binomial
        """
        return "mean {}, standard deviation {}, p {}, n {}".format(self.mean, self.stdev, self.p, self.n)
# Notebook check of the Binomial class written to distributions/binomial.py.
from distributions.binomial import Binomial
Binomial(0.6,300)
B1 = Binomial(0.4, 20) # object with probability 0.4 and size 20
print("Mean: {}, standard deviation: {}".format(B1.mean, round(B1.stdev,2)))
B1.read_data_file("data/binary.txt") # load the text file into B1.data
print("Data Loaded :", B1.data)
print("Initial PDF",round(B1.pdf(5), 5))
# Replace p and n (and the derived mean/stdev) with values computed from the data.
p, n = B1.replace_stats_with_data()
print("Data p: {}, n: {}".format(round(p,3),n))
print("Data PDF",round(B1.pdf(5), 5))
B1.plot_bar()
B1.plot_bar_pdf();
%%writefile test/test_binomial.py
import unittest
from distributions.binomial import Binomial
class TestBinomialClass(unittest.TestCase):
    """Unit tests for the Binomial distribution class."""

    def setUp(self):
        """Create a Binomial(0.4, 20) fixture loaded with data/binary.txt."""
        fixture = Binomial(0.4, 20)
        fixture.read_data_file('data/binary.txt')
        self.binomial = fixture

    def test_initialization(self):
        """Constructor arguments are stored as p and n."""
        b = self.binomial
        self.assertEqual(b.p, 0.4, 'p value incorrect')
        self.assertEqual(b.n, 20, 'n value incorrect')

    def test_readdata(self):
        """The data file is read into the data attribute in order."""
        expected = [1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1]
        self.assertEqual(self.binomial.data, expected, 'data not read in correctly')

    def test_calculatemean(self):
        """The mean is p * n."""
        self.assertEqual(self.binomial.calculate_mean(), 8)

    def test_calculatestdev(self):
        """The standard deviation matches the known value for p=0.4, n=20."""
        rounded = round(self.binomial.calculate_stdev(), 2)
        self.assertEqual(rounded, 2.19)

    def test_replace_stats_with_data(self):
        """p and n are recomputed from the loaded data."""
        p, n = self.binomial.replace_stats_with_data()
        self.assertEqual(round(p, 3), 0.471)
        self.assertEqual(n, 17)

    def test_pdf(self):
        """pdf uses the constructor parameters first, then the data statistics."""
        b = self.binomial
        self.assertEqual(round(b.pdf(5), 5), 0.07465)
        self.assertEqual(round(b.pdf(3), 5), 0.01235)

        # Switch to the p and n derived from the data file.
        b.replace_stats_with_data()
        self.assertEqual(round(b.pdf(5), 5), 0.06923)
        self.assertEqual(round(b.pdf(3), 5), 0.00963)

    def test_add(self):
        """Adding two Binomials with equal p keeps p and sums n."""
        binomial_sum = Binomial(.4, 20) + Binomial(.4, 60)
        self.assertEqual(binomial_sum.p, .4)
        self.assertEqual(binomial_sum.n, 80)
# Run the test suite when this module is executed directly as a script.
if __name__ == '__main__':
unittest.main()
# Notebook shell escape: execute the Binomial test module written above.
!python test/test_binomial.py
%%writefile distributions/__init__.py
from .binomial import Binomial
from .gaussian import Gaussian
%%writefile setup.py
from setuptools import setup
# Package metadata for the `distributions` package; one keyword per line.
setup(
    name='distributions',
    version='0.2',
    description='Gaussian and Binomial distributions',
    packages=['distributions'],
    author="Abid Ali Awan",
    author_email="abidaliawan@rocketmail.com",
    zip_safe=False,
)