# Import libraries
import numpy as np
import matplotlib.pyplot as plt
# Input exact values for comparison
#
dutch_man_1925 = 174.83; us_man_1925 = 174.53
dutch_man_1955 = 180.23; us_man_1955 = 177.22
dutch_man_1995 = 182.54; us_man_1995 = 177.16
# Read in the data and plot; add axes labels, plot title and legend
# Note: to add legend use label option in plt.plot (e.g., label='Dutch Men') and then use
# the command plt.legend ()
#
years = np.loadtxt('human_heights.txt', skiprows=2, usecols=0)
dutch_men = np.loadtxt('human_heights.txt', skiprows=2, usecols=2)
us_men = np.loadtxt('human_heights.txt', skiprows=2, usecols=4)
plt.plot(years, dutch_men, 'ro', label="Dutch Men")
plt.plot(years, us_men, 'bo', label="US Men")
plt.xlabel = "Years"
plt.ylabel = "Height (cm)"
plt.title("US Men height vs Dutch Men height")
plt.legend()
# Linear regression fit for both Dutch and U.S.; plot and print out the line
y_array = np.array(years)
dutch_men_array = np.array(dutch_men)
us_men_array = np.array(us_men)
dutch_men_coefficient = np.polyfit(y_array, dutch_men_array, 1)
us_men_coefficient = np.polyfit(y_array, us_men_array, 1)
print(f"Dutch Men heights: y = {round(dutch_men_coefficient[0], 2)} * x + {round(dutch_men_coefficient[1], 2)}")
print(f"US Men heights: y = {round(us_men_coefficient[0], 2)} * x + {round(us_men_coefficient[1], 2)}")
def eval_line (coeff, x_eval) :
return (coeff[0] * x_eval + coeff[1])
dutch_men_line = eval_line(dutch_men_coefficient, y_array)
us_men_line = eval_line(us_men_coefficient, y_array)
plt.plot(y_array, dutch_men_line, 'r', label="Dutch Men")
plt.plot(y_array, us_men_line, 'b', label="US Men")
plt.xlabel = "Years"
plt.ylabel = "Height (cm)"
plt.title("US Men height vs Dutch Men height")
plt.legend()
# Calculate the variance for each fit; use the function that we wrote in a previous notebook
#
# Input: the x and y arrays for the data points, coefficients of line found using LR
#
# Output: variance
#
def linear_variance(x, y, coeff):
var = 0.
n = len(x)
for i in range(0,n) :
y_line = coeff[0] * x[i] + coeff[1]
y_data = y[i]
distance = y_data-y_line
var = var + distance **2
var = (var)/ float(n)
return (var)
dutch_men_linear_variance = linear_variance(years, dutch_men, dutch_men_coefficient)
us_men_linear_variance = linear_variance(years, us_men, us_men_coefficient)
print(f"Linear variance for US: {us_men_linear_variance}")
print(f"Linear variance for Dutch: {dutch_men_linear_variance}")
# Quadratic regression fit for Dutch and U.S.; plot and print out the parabolas
dutch_quad = np.polyfit(y_array, dutch_men_array, 2)
us_quad = np.polyfit(y_array, us_men_array, 2)
print(f"US men heights: y = {round(us_quad[0], 2)} * x^2 + {round(us_quad[1], 2)} * x + {round(us_quad[2], 2)}")
print(f"Dutch men heights: y = {round(dutch_quad[0], 2)} * x^2 + {round(dutch_quad[1], 2)} * x + {round(dutch_quad[2], 2)}")
dutch_f = np.poly1d(dutch_quad)
us_f = np.poly1d(us_quad)
dutch_parabola = dutch_f(y_array)
us_parabola = us_f(y_array)
plt.plot(y_array, dutch_parabola, 'r', label="Dutch men Quadratic")
plt.plot(y_array, us_parabola, 'b', label="US men Quadratic")
plt.xlabel = "Years"
plt.ylabel = "Height (cm)"
plt.title("US men height vs Dutch men height")
plt.legend()
# Calculate variance for the quadratic fits
def quadratic_variance (x, y, coeff):
n=len(x)
degree = len(coeff) -1
var = 0.
for i in range(0,n):
if (degree == 1):
y_line = coeff[0] * x[i] + coeff[1]
else :
y_line = coeff[0] * x[i]*x[i] + coeff[1] *x[i] + coeff[2]
y_data = y[i]
distance = y_data-y_line
var = var + distance ** 2
var = (var)/ float(n)
return (var)
dutch_quadratic_variance = quadratic_variance(years, dutch_men, dutch_quad)
us_quadratic_variance = quadratic_variance(years, us_men, us_quad)
print(f"Quadratic variance for US: {us_quadratic_variance}")
print(f"Quadratic variance for Dutch: {dutch_quadratic_variance}")
# Use best fit to predict average heights in 1955 and 1995 for both Dutch and U.S.; compute percent error;
# round values to 2 decimal places
#
unknownyears = np.array([1955, 1995])
def percent_error(predicted, actual) :
return abs((predicted - actual))/actual * 100
us_estimate = us_f(unknownyears);
us_1955 = round(us_estimate[0], 2)
us_1995 = round(us_estimate[1], 2)
predicted_us_1955 = round(percent_error(us_1955, us_1955), 2)
predicted_us_1995 = round(percent_error(us_1995, us_1995), 2)
dutch_estimate = dutch_f(unknownyears);
dutch_1955 = round(dutch_estimate[0], 2)
dutch_1995 = round(dutch_estimate[1], 2)
predicted_dutch_1955 = round(percent_error(dutch_1955, dutch_1955), 2)
predicted_dutch_1995 = round(percent_error(dutch_1995, dutch_1995), 2)
print("1955:")
print(f"Predicted average height for US men: {us_1955}. percent error: {predicted_us_1955}%")
print(f"Predicted average height for Dutch men: {dutch_1955}. percent error: {predicted_dutch_1955}%")
print("1995:")
print(f"Predicted average height for US men: {us_1995}. percent error: {predicted_us_1995}%")
print(f"Predicted average height for Dutch men: {dutch_1995}. percent error: {predicted_dutch_1995}%")
Since the data is non linear quadratic regression is the best model to use.