# Import libraries
import numpy as np
import matplotlib.pyplot as plt
# Input exact values for comparison
#
dutch_man_1925 = 174.83; us_man_1925 = 174.53
dutch_man_1955 = 180.23; us_man_1955 = 177.22
dutch_man_1995 = 182.54; us_man_1995 = 177.16
#
dutch_woman_1925 = 162.2; us_woman_1925 = 160.97
dutch_woman_1955 = 167.11; us_woman_1955 = 163.54
dutch_woman_1995 = 168.73; us_woman_1995 = 163.56
# Read in the data and plot; add axes labels, plot title and legend
# Note: to add legend use label option in plt.plot (e.g., label='Dutch Men') and then use
# the command plt.legend ()
#
years = np.loadtxt('human_heights.txt',skiprows=2,usecols=0)
dutch_men = np.loadtxt('human_heights.txt',skiprows=2,usecols=1)
us_men = np.loadtxt('human_heights.txt',skiprows=2,usecols=3)
plt.plot(years,dutch_men,'ro',label='Dutch Men')
plt.plot(years,us_men,'bo',label='U.S. Men')
plt.xlabel('Years')
plt.ylabel('Height in CM')
plt.title('Dutch vs U.S. Men Heights')
plt.legend()
# Linear regression fit for both Dutch and U.S.; plot and print out the line
#np.loadtxt already returns NumPy arrays, so these conversions are redundant but harmless; the *_array names are kept for clarity
year_array = np.array(years)
dutch_men_array = np.array(dutch_men)
us_men_array = np.array(us_men)
#calculate coefficients
dutch_men_coefficient = np.polyfit(year_array,dutch_men_array,1)
us_men_coefficient = np.polyfit(year_array,us_men_array,1)
#print out equations
print(f'The equation for Dutch men is {round(dutch_men_coefficient[0],2)} * x + {round(dutch_men_coefficient[1],2)}')
print(f'The equation for U.S. men is {round(us_men_coefficient[0],2)} * x + {round(us_men_coefficient[1],2)}')
#write our function that evaluates the line at an array of x-values
def eval_line(coeff,x_eval):
    return coeff[0] * x_eval + coeff[1]
#create our lines
dutch_men_line = eval_line(dutch_men_coefficient,year_array)
us_men_line = eval_line(us_men_coefficient,year_array)
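# Optional sanity check (a minimal sketch, not part of the original assignment):
# NumPy's np.polyval evaluates a polynomial from its coefficients (highest degree first,
# matching np.polyfit), so it should agree with our hand-written eval_line.
assert np.allclose(dutch_men_line, np.polyval(dutch_men_coefficient, year_array))
assert np.allclose(us_men_line, np.polyval(us_men_coefficient, year_array))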
#plot our lines
plt.plot(year_array,dutch_men_line,'r',label='Dutch Men Line')
plt.plot(year_array,us_men_line,'b',label='U.S. Men Line')
plt.xlabel('Years')
plt.ylabel('Height in CM')
plt.title('Dutch vs U.S. Men Height Regression')
plt.legend()
# Calculate the variance for each fit; use the function that we wrote in a previous notebook
#
# Input: the x and y arrays for the data points, coefficients of line found using LR
#
# Output: variance
#
def calculate_linear_variance(x,y,coeff):
    var = 0.0
    n = len(x)
    for i in range(0,n):
        y_line = coeff[0] * x[i] + coeff[1]
        y_data = y[i]
        distance = y_data - y_line
        var = var + distance * distance
    var = var / float(n)
    return var
#call our function
dutch_men_linear_variance = calculate_linear_variance(year_array,dutch_men_array,dutch_men_coefficient)
us_men_linear_variance = calculate_linear_variance(year_array,us_men_array,us_men_coefficient)
print(f'The variance for Dutch men is {dutch_men_linear_variance}')
print(f'The variance for U.S. men is {us_men_linear_variance}')
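# Optional cross-check (a sketch using NumPy directly, not part of the original notebook):
# this variance is just the mean squared residual, so it can also be computed in one line.
assert np.isclose(dutch_men_linear_variance,
                  np.mean((dutch_men_array - np.polyval(dutch_men_coefficient, year_array))**2))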
# Quadratic regression fit for Dutch and U.S.; plot and print out the parabolas
#calculate coefficients for quadratic
dutch_men_coefficient_quadratic = np.polyfit(year_array,dutch_men_array,2)
us_men_coefficient_quadratic = np.polyfit(year_array,us_men_array,2)
#print out equations and use np.poly1d on our fits
#(the x^2 coefficient is small, so it is shown with more decimal places)
print(f'The equation for Dutch men is {round(dutch_men_coefficient_quadratic[0],5)} * x^2 + {round(dutch_men_coefficient_quadratic[1],2)} * x + {round(dutch_men_coefficient_quadratic[2],2)}')
print(f'The equation for U.S. men is {round(us_men_coefficient_quadratic[0],5)} * x^2 + {round(us_men_coefficient_quadratic[1],2)} * x + {round(us_men_coefficient_quadratic[2],2)}')
one_d_dutch_men = np.poly1d(dutch_men_coefficient_quadratic)
one_d_us_men = np.poly1d(us_men_coefficient_quadratic)
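# Optional: np.poly1d objects have a readable string form, so the fitted quadratics can
# also be displayed directly (just a convenience, not required by the assignment).
print('Dutch men quadratic fit:')
print(one_d_dutch_men)
print('U.S. men quadratic fit:')
print(one_d_us_men)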
#create our parabolas
dutch_men_parabola = one_d_dutch_men(year_array)
us_men_parabola = one_d_us_men(year_array)
plt.plot(year_array,dutch_men_parabola,'r',label='Dutch Men Parabola')
plt.plot(year_array,us_men_parabola,'b',label='U.S. Men Parabola')
plt.xlabel('Years')
plt.ylabel('Height in CM')
plt.title('Dutch Men vs U.S. Men Quadratic Regression')
plt.legend()
# Calculate variance for the quadratic fits
def calculate_quadratic_variance(x,y,coeff):
    n = len(x)
    degree = len(coeff) - 1
    var = 0
    for i in range(0,n):
        if (degree == 1):
            y_line = coeff[0] * x[i] + coeff[1]
        else:
            y_line = coeff[0] * x[i]*x[i] + coeff[1] * x[i] + coeff[2]
        y_data = y[i]
        distance = y_data - y_line
        var = var + distance ** 2
    var = var / float(n)
    return var
dutch_men_quadratic_variance = calculate_quadratic_variance(year_array,dutch_men_array,dutch_men_coefficient_quadratic)
us_men_quadratic_variance = calculate_quadratic_variance(year_array,us_men_array,us_men_coefficient_quadratic)
print(f'The quadratic variance for Dutch men is {dutch_men_quadratic_variance}')
print(f'The quadratic variance for U.S. men is {us_men_quadratic_variance}')
# Use best fit to predict average heights in 1955 and 1995 for both Dutch and U.S.; compute percent error;
# round values to 2 decimal places
#
#initialize our prediction years
prediction_years = np.array([1955,1995])
def calculate_percent_error(predicted,actual):
    return abs(predicted-actual)/actual * 100
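# Quick sanity check of the percent-error helper (illustrative values only, not data
# from the assignment): a prediction of 101.0 against an actual value of 100.0 is a 1% error.
assert np.isclose(calculate_percent_error(101.0, 100.0), 1.0)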
dutch_men_prediction = one_d_dutch_men(prediction_years)
dutch_men_1955 = round(dutch_men_prediction[0], 2)
dutch_men_1995 = round(dutch_men_prediction[1], 2)
dutch_percent_error_1955 = round(calculate_percent_error(dutch_men_1955, dutch_man_1955), 2)
dutch_percent_error_1995 = round(calculate_percent_error(dutch_men_1995, dutch_man_1995), 2)
print(f'The prediction for a Dutch man in 1955 is {dutch_men_1955} with a percent error of {dutch_percent_error_1955}%')
print(f'The prediction for a Dutch man in 1995 is {dutch_men_1995} with a percent error of {dutch_percent_error_1995}%')
us_men_prediction = one_d_us_men(prediction_years)
us_men_1955 = round(us_men_prediction[0], 2)
us_men_1995 = round(us_men_prediction[1], 2)
us_percent_error_1955 = round(calculate_percent_error(us_men_1955, us_man_1955), 2)
us_percent_error_1995 = round(calculate_percent_error(us_men_1995, us_man_1995), 2)
print('**********************************************************************')
print(f'The prediction for a U.S. man in 1955 is {us_men_1955} with a percent error of {us_percent_error_1955}%')
print(f'The prediction for a U.S. man in 1995 is {us_men_1995} with a percent error of {us_percent_error_1995}%')
# The height data are not well described by a straight line: the linear fit has a higher
# variance than the quadratic fit, and that larger residual spread hurts its predictions.
# The quadratic model is therefore the better choice here.
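# Optional check backing up the conclusion above (a minimal sketch, not part of the
# original notebook): compare the linear and quadratic variances for each group directly.
for group, lin_var, quad_var in [('Dutch men', dutch_men_linear_variance, dutch_men_quadratic_variance),
                                 ('U.S. men', us_men_linear_variance, us_men_quadratic_variance)]:
    better = 'quadratic' if quad_var < lin_var else 'linear'
    print(f'{group}: linear variance = {lin_var:.4f}, quadratic variance = {quad_var:.4f} -> {better} fit is better')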