# Import libraries
import numpy as np
import matplotlib.pyplot as plt
# Recorded average heights (in cm) used as the exact values when checking
# the predictions made from the fitted curves.
#
# Men
dutch_man_1925 = 174.83
us_man_1925 = 174.53
dutch_man_1955 = 180.23
us_man_1955 = 177.22
dutch_man_1995 = 182.54
us_man_1995 = 177.16
#
# Women
dutch_woman_1925 = 162.2
us_woman_1925 = 160.97
dutch_woman_1955 = 167.11
us_woman_1955 = 163.54
dutch_woman_1995 = 168.73
us_woman_1995 = 163.56
# Read in the data and plot; add axes labels, plot title and legend
# Note: to add legend use label option in plt.plot (e.g., label='Dutch Men') and then use
# the command plt.legend ()
#
filename = 'human_heights.txt'
# Parse the file once and unpack it column-wise instead of re-reading it for
# every column. Column order: year, Dutch men, Dutch women, US men, US women.
# The first two lines of the file are skipped (presumably header text).
years, dutch_men, dutch_women, us_men, us_women = np.loadtxt(
    filename, skiprows=2, usecols=(0, 1, 2, 3, 4), unpack=True
)

# Women: Dutch vs US comparison plot
plt.plot(years, dutch_women, 'mo', label='Dutch Women')
plt.plot(years, us_women, 'co', label='US Women')
plt.xlabel('years')
plt.ylabel('height in cm')
plt.title('Height of Dutch Women vs Height of US Women')
plt.legend()
# Finish this figure here; otherwise, when run as a script, the next plotting
# calls draw onto the same axes and overwrite this title.
plt.show()
# Linear regression fit for both Dutch and U.S.; plot and print out the line
# US women
# np.polyfit with degree 1 returns [slope, intercept].
coeff1 = np.polyfit(years, us_women, 1)
print(coeff1)
print(f'the equation of the line that fits for US Women using linear regression is {round(coeff1[0],3)}x + {round(coeff1[1],3)}')

# Data points
plt.plot(years, us_women, 'co')
plt.xlabel('years')
plt.ylabel('height in cm')

# Fitted line, sampled evenly across the range of years in the data
yrmin = np.min(years)
yrmax = np.max(years)
xx = np.linspace(yrmin, yrmax)
f = np.poly1d(coeff1)
yy = f(xx)
plt.plot(xx, yy, 'm')
plt.title('Linear Regression fit to Data of US Women')
plt.show()
# Dutch women
# np.polyfit with degree 1 returns [slope, intercept].
coeff2 = np.polyfit(years, dutch_women, 1)
print(coeff2)
print(f'the equation of the line that fits for Dutch Women using linear regression is {round(coeff2[0],3)}x + {round(coeff2[1],3)}')

# Data points
plt.plot(years, dutch_women, 'co')
plt.xlabel('years')
plt.ylabel('height in cm')

# Fitted line, sampled evenly across the range of years in the data
yrmin = np.min(years)
yrmax = np.max(years)
x1x = np.linspace(yrmin, yrmax)
f1 = np.poly1d(coeff2)
y1y = f1(x1x)
plt.plot(x1x, y1y, 'm')
plt.title('Linear Regression fit to Data of Dutch Women')
plt.show()
# Calculate the variance for each fit; use the function that we wrote in a previous notebook
#variance for us women
def calculate_variance(x, y, coeff):
    """Return the mean squared residual of a polynomial fit.

    Parameters
    ----------
    x, y : sequence of float
        Coordinates of the data points; both must have the same length.
    coeff : sequence of float
        Polynomial coefficients, highest power first (the order produced by
        ``np.polyfit``). Any degree is accepted.

    Returns
    -------
    float
        Sum of the squared vertical distances between each data point and
        the fitted curve, divided by the number of points.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` do not have the same shape.
    ZeroDivisionError
        If ``x`` is empty.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    if x.shape != y.shape:
        raise ValueError('x and y must have the same length')
    n = len(x)
    # np.polyval evaluates the polynomial for every x at once and works for
    # any degree, not just lines and parabolas.
    residuals = y - np.polyval(coeff, x)
    # Plain-float division keeps the ZeroDivisionError on empty input.
    return float(np.sum(residuals * residuals)) / n
# calculate_variance is already defined above and handles both degree-1 and
# degree-2 coefficient arrays, so it is reused here rather than redefined.
var = calculate_variance(years, us_women, coeff1)
print(' the variance for us women is', var)
# variance for dutch women
var = calculate_variance(years, dutch_women, coeff2)
print(' the variance for dutch women is', var)
#
# calculate_variance input: the x and y arrays for the data points, and the coefficients of the fitted polynomial
#
# calculate_variance output: variance of the data about the fit
#
# Quadratic regression fit for Dutch and U.S.; plot and print out the parabolas
# US women
# np.polyfit with degree 2 returns [a, b, c] for a*x^2 + b*x + c.
coeff1_quad = np.polyfit(years, us_women, 2)
print(
    f' the equation of the parabola found using quadratic regression for US women is {coeff1_quad[0]}x^2 '
    f'+{coeff1_quad[1]}x + {coeff1_quad[2]}'
)

# Data points
plt.plot(years, us_women, 'co')
plt.xlabel('years')
plt.ylabel('height in cm')

# Fitted parabola, sampled evenly across the range of years in the data
yrmin = np.min(years)
yrmax = np.max(years)
xx = np.linspace(yrmin, yrmax)
f = np.poly1d(coeff1_quad)
yy = f(xx)
plt.plot(xx, yy, 'm')
# This figure shows the quadratic fit, so the title must not say "Linear".
plt.title('Quadratic Regression fit to Data of US Women')
plt.show()
# Dutch women
# np.polyfit with degree 2 returns [a, b, c] for a*x^2 + b*x + c.
coeff2_quad = np.polyfit(years, dutch_women, 2)
print(
    f' the equation of the parabola found using quadratic regression for Dutch women is {coeff2_quad[0]}x^2 '
    f'+{coeff2_quad[1]}x + {coeff2_quad[2]}'
)

# Data points
plt.plot(years, dutch_women, 'co')
plt.xlabel('years')
plt.ylabel('height in cm')

# Fitted parabola, sampled evenly across the range of years in the data
yrmin = np.min(years)
yrmax = np.max(years)
x1x = np.linspace(yrmin, yrmax)
f1 = np.poly1d(coeff2_quad)
y1y = f1(x1x)
plt.plot(x1x, y1y, 'm')
# This figure shows the quadratic fit, so the title must not say "Linear".
plt.title('Quadratic Regression fit to Data of Dutch Women')
plt.show()
# Calculate variance for the quadratic fits
# calculate_variance, defined earlier in this file, already evaluates
# three-coefficient (quadratic) fits, so it is reused rather than redefined.
# variance for us women
var = calculate_variance(years, us_women, coeff1_quad)
print(' the variance for us women is', var)
# variance for dutch women
var = calculate_variance(years, dutch_women, coeff2_quad)
print(' the variance for dutch women is', var)
# Use best fit to predict average heights in 1955 and 1995 for both Dutch and U.S.; compute percent error;
# round values to 2 decimal places
#
# The quadratic fit was chosen because visually the curve fit the data better
# and the variances were smaller than with the linear fit.


def _percent_error(predicted, actual):
    """Return the absolute error of `predicted` relative to `actual`, in percent."""
    return abs(predicted - actual) / actual * 100


# Evaluate the fitted parabolas directly from their coefficient arrays rather
# than from numbers copied out of earlier printed output, so the predictions
# always match the current fit.
us_pred_1955 = round(float(np.polyval(coeff1_quad, 1955)), 2)
us_pred_1995 = round(float(np.polyval(coeff1_quad, 1995)), 2)
dutch_pred_1955 = round(float(np.polyval(coeff2_quad, 1955)), 2)
dutch_pred_1995 = round(float(np.polyval(coeff2_quad, 1995)), 2)

# US women
print(f' predicted US Womens height in 1955 is {us_pred_1955}')
print(f' predicted US Womens height in 1995 is {us_pred_1995}')
# Dutch women
print(f' predicted Dutch Womens height in 1955 is {dutch_pred_1955}')
print(f' predicted Dutch Womens height in 1995 is {dutch_pred_1995}')

# Percent error of each rounded prediction against the recorded heights
# defined at the top of the file.
# US women
print(f' US womens 1955 percent error is {round(_percent_error(us_pred_1955, us_woman_1955), 2)}')
print(f' US womens 1995 percent error is {round(_percent_error(us_pred_1995, us_woman_1995), 2)}')
# Dutch women
print(f' Dutch womens 1955 percent error is {round(_percent_error(dutch_pred_1955, dutch_woman_1955), 2)}')
print(f' Dutch womens 1995 percent error is {round(_percent_error(dutch_pred_1995, dutch_woman_1995), 2)}')