#DUTCH MEN TO US MEN#
# Import libraries
import numpy as np
import matplotlib.pyplot as plt
# Input exact values for comparison
#
# Exact reference values, one (Dutch, US) pair per comparison year.
# NOTE(review): units are not stated here -- presumably centimetres; confirm.
#
# Men
dutch_man_1925, us_man_1925 = 174.83, 174.53
dutch_man_1955, us_man_1955 = 180.23, 177.22
dutch_man_1995, us_man_1995 = 182.54, 177.16
#
# Women
dutch_woman_1925, us_woman_1925 = 162.2, 160.97
dutch_woman_1955, us_woman_1955 = 167.11, 163.54
dutch_woman_1995, us_woman_1995 = 168.73, 163.56
# Read in the data and plot; add axes labels, plot title and legend
# Note: to add legend use label option in plt.plot (e.g., label='Dutch Men') and then use
# the command plt.legend ()
#
filename = 'human_heights.txt'
# Read two columns of the data file, skipping its first two rows.
# NOTE(review): presumably column 1 holds Dutch men and column 3 holds US men --
# confirm against the layout of the file.
dutch = np.loadtxt(filename, usecols=1, skiprows=2)
print(dutch)
us = np.loadtxt(filename, usecols=3, skiprows=2)
print(us)
# NOTE(review): only y-values are passed, so the points are drawn against their
# row index rather than against a year value read from the file.
plt.plot(dutch, 'ro', label='Dutch Men')
plt.plot(us, 'bo', label='US Men')
plt.xlabel("Year")
# The file holds heights, so the labels say "Height" (they previously said "Weight").
plt.ylabel("Average Height")
plt.title("Height of Dutch Men vs US Men")
# The legend entries come from the label= arguments above. A bare string passed
# to plt.legend() is iterated character by character, giving entries "D" and "u".
plt.legend()
plt.show()
# Linear regression fit for both Dutch and U.S.; plot and print out the line
# Least-squares straight line with the Dutch values as x and the US values as y.
line_coeff = np.polyfit(dutch, us, deg=1)
print(line_coeff)
slope, intercept = line_coeff  # polyfit returns the highest power first
print(f"The equation of the line using linear regression is {slope}x +{intercept}")

# Callable form of the fitted line, sampled across the range of the Dutch data
# (np.linspace uses its default number of sample points).
f = np.poly1d(line_coeff)
dmin, dmax = dutch.min(), dutch.max()
xx = np.linspace(dmin, dmax)
yy = f(xx)

# Scatter plot of the data with the fitted line drawn on top.
plt.plot(dutch, us, 'ro')
plt.plot(xx, yy, 'b')
plt.xlabel("Dutch Men")
plt.ylabel("US Men")
plt.title("Linear Regression fit to data")
plt.show()
# Calculate the variance for each fit; use the function that we wrote in a previous notebook
def calculate_variance(x, y, coeff):
    """Return the mean squared vertical distance between data and a fitted polynomial.

    Parameters
    ----------
    x, y : sequences of numbers of equal length
        Coordinates of the data points.
    coeff : sequence of numbers
        Polynomial coefficients, highest power first (the order produced by
        np.polyfit). Any degree is accepted, not only lines and parabolas.

    Returns
    -------
    float
        sum((y[i] - p(x[i])) ** 2) / n, where p is the polynomial described by
        coeff and n is the number of data points.

    Raises
    ------
    ValueError
        If there are no data points, or x and y differ in shape.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    if x.size == 0:
        # Averaging over zero points would otherwise be a division by zero.
        raise ValueError("calculate_variance needs at least one data point")
    if x.shape != y.shape:
        raise ValueError("x and y must have the same shape")
    # np.polyval evaluates a polynomial of any degree, so there is no special
    # case per degree.
    residuals = y - np.polyval(coeff, x)
    return float(np.mean(residuals ** 2))
# Mean squared distance between the data points and the fitted straight line.
var = calculate_variance(x=dutch, y=us, coeff=line_coeff)
print(' The variance of the linear fit to data is', var)
#
# calculate_variance input: the x and y arrays for the data points, coefficients of the fitted polynomial
#
# calculate_variance output: variance
#
# Quadratic regression fit for Dutch and U.S.; plot and print out the parabolas
# Least-squares parabola with the Dutch values as x and the US values as y.
parabola_coeff = np.polyfit(dutch, us, deg=2)
quad_term, lin_term, const_term = parabola_coeff  # highest power first
print(f"The equation of the parabola using regression is {quad_term}x^2 +{lin_term}x + {const_term}")

# Callable form of the fitted parabola, sampled across the range of the Dutch
# data (np.linspace uses its default number of sample points).
g = np.poly1d(parabola_coeff)
dmin, dmax = dutch.min(), dutch.max()
xx = np.linspace(dmin, dmax)
yy = g(xx)

# Scatter plot of the data with the fitted parabola drawn on top.
plt.plot(dutch, us, 'ro')
plt.plot(xx, yy, 'b')
plt.xlabel("Dutch Men")
plt.ylabel("US Men")
plt.title("Quadratic Regression fit to data")
plt.show()
# Calculate variance for the quadratic fits
# Mean squared distance between the data points and the fitted parabola.
var_quad = calculate_variance(x=dutch, y=us, coeff=parabola_coeff)
print("The variance for a quadratic fit to data is", var_quad)
# Use best fit to predict average heights in 1955 and 1995 for both Dutch and U.S.; compute percent error;
# round values to 2 decimal places
#
# f and g were fitted with Dutch values as x and US values as y, so each must be
# evaluated at a Dutch value. The earlier f(0) / g(1) returned the fitted curves
# at x = 0 and x = 1 instead of at a 1955 value.
# NOTE(review): using dutch_man_1955 as the input is the reviewer's reading of
# the intent (predict the 1955 US value from the 1955 Dutch value) -- confirm.
predicted_linear = f(dutch_man_1955)  # remember we set f = poly1d for linear case
predicted_quad = g(dutch_man_1955)  # remember we set g = poly1d for parabolic case
# Values are reported to 2 decimal places. The two string pieces are joined with
# an explicit space so the number no longer runs into the word "and".
print(f"The predicted value at 1955 using the linear fit is {predicted_linear:.2f} "
      f"and using a quadratic fit is {predicted_quad:.2f}")
print(f"The difference in the predicted values is {predicted_linear - predicted_quad:.2f}")
# Percent error of each prediction relative to the exact 1955 US value.
percent_error_linear = abs(predicted_linear - us_man_1955) / us_man_1955 * 100
percent_error_quad = abs(predicted_quad - us_man_1955) / us_man_1955 * 100
print(f"The percent error at 1955 is {percent_error_linear:.2f}% for the linear fit "
      f"and {percent_error_quad:.2f}% for the quadratic fit")