# Import libraries
import numpy as np
import matplotlib.pyplot as plt
# Input exact values for comparison
#
# Reference average heights for Dutch and U.S. men in 1925, 1955 and 1995
# (presumably in cm, matching the plotted data -- confirm against the source).
# The 1955 and 1995 values serve as the "exact" inputs for percent error.
dutch_man_1925, us_man_1925 = 174.83, 174.53
dutch_man_1955, us_man_1955 = 180.23, 177.22
dutch_man_1995, us_man_1995 = 182.54, 177.16
#
# dutch_woman_1925 = 162.2; us_woman_1925 = 160.97
# dutch_woman_1955 = 167.11; us_woman_1955 = 163.54
# dutch_woman_1995 = 168.73; us_woman_1995 = 163.56
# Read in the data and plot; add axes labels, plot title and legend
# Note: to add legend use label option in plt.plot (e.g., label='Dutch Men') and then use
# the command plt.legend ()
#
# Read the year column (0) and the Dutch (1) and U.S. (3) men's height columns
# in a single pass over the file instead of parsing it three times.
# The first two lines are skipped (presumably header rows).
years, dutchMen, usMen = np.loadtxt(
    'human_heights.txt', usecols=(0, 1, 3), skiprows=2, unpack=True
)

# Plot both height series against year; the labels feed plt.legend().
plt.plot(years, dutchMen, label='Dutch Men')
plt.plot(years, usMen, label='US Men')
plt.xlabel('Years')
plt.ylabel('Height in cm')
plt.title('Average Heights of US and Dutch Men over the last century')
plt.legend()
# Linear regression fit for both Dutch and U.S.; plot and print out the line
#first for dutch men
# Least-squares straight-line (degree 1) fits of height against year.
dutchCoeff = np.polyfit(years, dutchMen, 1)
f = np.poly1d(dutchCoeff)
usCoeff = np.polyfit(years, usMen, 1)
g = np.poly1d(usCoeff)

# Evenly spaced x-values spanning the data range, used to draw the fitted lines.
yearFit = np.linspace(years.min(), years.max(), len(years))

# Data points as markers.
plt.plot(years, dutchMen, 'ro', label='Dutch Men')
plt.plot(years, usMen, 'bo', label='US Men')
# Each fitted line is drawn against yearFit, the x-values it was evaluated at;
# pairing f(yearFit) with `years` misplaces points whenever the years are not
# evenly spaced and ascending.
plt.plot(yearFit, f(yearFit), label='Regression Line for Dutch')
plt.plot(yearFit, g(yearFit), label='Regression line for US')
plt.xlabel('Years')
plt.ylabel('Height in cm')
plt.title('Average Heights of US and Dutch Men over the last century')
plt.legend()
# Calculate the variance for each fit; use the function that we wrote in a previous notebook
#
# Input: the x and y arrays for the data points, coefficients of line found using LR
#
# Output: variance
#
def findVariance(x, y, coeff):
    """Return the mean squared vertical distance between data and a polynomial.

    Parameters
    ----------
    x, y : sequences of numbers
        Coordinates of the data points; both must have the same non-zero
        length.
    coeff : sequence of numbers
        Polynomial coefficients, highest power first (the order returned by
        ``np.polyfit``). Any degree is accepted.

    Returns
    -------
    float
        ``sum((y[i] - p(x[i])) ** 2) / len(x)`` where ``p`` is the polynomial
        described by ``coeff``.

    Raises
    ------
    ValueError
        If ``x`` is empty or ``x`` and ``y`` differ in length.
    """
    n = len(x)
    if n == 0:
        raise ValueError('findVariance requires at least one data point')
    if len(y) != n:
        raise ValueError('x and y must have the same length')

    total = 0.0
    for xi, yi in zip(x, y):
        # Horner's rule: evaluates the polynomial for any degree rather than
        # special-casing degree 1 and treating everything else as quadratic.
        fitted = 0.0
        for c in coeff:
            fitted = fitted * xi + c
        residual = yi - fitted
        # Add the squared residual to the running sum.
        total += residual * residual
    return total / float(n)
# Run findVariance on each straight-line fit, then report both results.
varDutch, varUS = (
    findVariance(years, dutchMen, dutchCoeff),
    findVariance(years, usMen, usCoeff),
)
print(
    f'the dutch variance is {varDutch}',
    f'the us variance is {varUS}',
    sep='\n',
)
# Quadratic regression fit for Dutch and U.S.; plot and print out the parabolas
#first for dutch men
# Least-squares quadratic (degree 2) fits of height against year.
dutchCoeffQ = np.polyfit(years, dutchMen, 2)
h = np.poly1d(dutchCoeffQ)
usCoeffQ = np.polyfit(years, usMen, 2)
k = np.poly1d(usCoeffQ)

# Evenly spaced x-values spanning the data range, used to draw the parabolas.
yearFit = np.linspace(years.min(), years.max(), len(years))

# Data points as markers.
plt.plot(years, dutchMen, 'ro', label='Dutch Men')
plt.plot(years, usMen, 'bo', label='US Men')
# Each parabola is drawn against yearFit, the x-values it was evaluated at;
# pairing h(yearFit) with `years` misplaces points whenever the years are not
# evenly spaced and ascending.
plt.plot(yearFit, h(yearFit), label='Regression Line for Dutch')
plt.plot(yearFit, k(yearFit), label='Regression line for US')
plt.xlabel('Years')
plt.ylabel('Height in cm')
plt.title('Average Heights of US and Dutch Men over the last century')
plt.legend()
# Calculate variance for the quadratic fits
# Run findVariance on each quadratic fit, then report both results.
# Note: this rebinds varDutch and varUS, replacing any earlier values.
varDutch, varUS = (
    findVariance(years, dutchMen, dutchCoeffQ),
    findVariance(years, usMen, usCoeffQ),
)
print(
    f'the dutch variance is {varDutch}',
    f'the us variance is {varUS}',
    sep='\n',
)
# Use best fit to predict average heights in 1955 and 1995
#for both Dutch and U.S.; compute percent error;
# round values to 2 decimal places
#
# Evaluate the quadratic fits h (Dutch) and k (U.S.) at 1955 and 1995 and
# report each prediction rounded to two decimal places.
print(
    f'the predicted Dutch height in 1955 is {round(h(1955),2)} and in 1995 is {round(h(1995),2)}',
    f'the predicted us height in 1955 is {round(k(1955),2)} and in 1995 is {round(k(1995),2)}.',
    sep='\n',
)
def per_error(approx, exact):
    """Return the unsigned percent error of ``approx`` relative to ``exact``.

    The signed relative difference ``(approx - exact) / exact`` is scaled to a
    percentage, made non-negative, and rounded to two decimal places.
    """
    signed_pct = ((approx - exact) / exact) * 100
    # Flip the sign of negative errors so only the magnitude is reported.
    magnitude = -signed_pct if signed_pct < 0 else signed_pct
    return round(magnitude, 2)
# Percent error of each quadratic-fit prediction (h for Dutch, k for U.S.)
# against the reference heights for 1955 and 1995.
derror1, derror2 = (
    per_error(h(1955), dutch_man_1955),
    per_error(h(1995), dutch_man_1995),
)
uerror1, uerror2 = (
    per_error(k(1955), us_man_1955),
    per_error(k(1995), us_man_1995),
)
print(
    f'dutch 1955 percent error:{derror1} 1995 error: {derror2}',
    f'us 1955 percent error:{uerror1} 1995 error: {uerror2}',
    sep='\n',
)