# Import libraries
import numpy as np
import matplotlib.pyplot as plt
# Input exact values for comparison
#
# Reference average heights for men (cm), used to check model predictions.
dutch_man_1925 = 174.83
us_man_1925 = 174.53
dutch_man_1955 = 180.23
us_man_1955 = 177.22
dutch_man_1995 = 182.54
us_man_1995 = 177.16

# Reference average heights for women (cm).
dutch_woman_1925 = 162.2
us_woman_1925 = 160.97
dutch_woman_1955 = 167.11
us_woman_1955 = 163.54
dutch_woman_1995 = 168.73
us_woman_1995 = 163.56
# Read in the data and plot; add axes labels, plot title and legend
# Note: to add legend use label option in plt.plot (e.g., label='Dutch Men') and then use
# the command plt.legend ()
#
# Parse the data file once (skipping its two header rows) and slice out the
# columns that are needed, instead of re-reading the file for every column.
my_data = np.loadtxt('human_heights.txt', skiprows=2)
years = my_data[:, 0]       # column 0: year
dutch_men = my_data[:, 1]   # column 1: plotted below as 'Dutch Men'
us_men = my_data[:, 3]      # column 3: plotted below as 'US Men'
print(years)
print(dutch_men)
print(us_men)

# Scatter plot of both series with axis labels, title and legend.
plt.plot(years, dutch_men, 'bo', label='Dutch Men')
plt.plot(years, us_men, 'ro', label='US Men')
plt.title("Dutch Men vs US Men Average Height")
plt.xlabel("Time (years)")
plt.ylabel("Average Height (cm)")
plt.legend()
# Linear regression fit for both Dutch and U.S.; plot and print out the line
# Least-squares straight-line fit (degree 1) for each series. np.polyfit
# returns the coefficients highest power first: [slope, intercept].
dutch_line_coeff = np.polyfit(years, dutch_men, 1)
print(f"equation of the line is for Dutch Men is = {dutch_line_coeff[0]}x + {dutch_line_coeff[1]}")
f = np.poly1d(dutch_line_coeff)
us_line_coeff = np.polyfit(years, us_men, 1)
print(f"equation of the line is for US Men is = {us_line_coeff[0]}x + {us_line_coeff[1]}")
f_1 = np.poly1d(us_line_coeff)

# years, dutch_men and us_men are already loaded earlier in the file, so the
# data file is not re-read here. A new figure is started so the data points and
# legend entries are not drawn a second time on top of the earlier plot.
plt.figure()
plt.plot(years, dutch_men, 'bo', label='Dutch Men')
plt.plot(years, us_men, 'ro', label='US Men')
plt.title("Dutch Men vs US Men Average Height")
plt.xlabel("Time (years)")
plt.ylabel("Average Height (cm)")
# Overlay the fitted lines in the colour of the matching data series.
plt.plot(years, f(years), 'b')
plt.plot(years, f_1(years), 'r')
plt.legend()
# Calculate the variance for each fit; use the function that we wrote in a previous notebook
def calculate_variance(x, y, coeff):
    """Return the mean squared residual between data points and a polynomial fit.

    Parameters
    ----------
    x, y : sequences of numbers with the same length
        Coordinates of the data points.
    coeff : sequence of numbers
        Polynomial coefficients, highest power first (the order produced by
        ``np.polyfit``). Any degree is supported.

    Returns
    -------
    float
        Sum of squared vertical distances between each ``y[i]`` and the
        polynomial evaluated at ``x[i]``, divided by the number of points.

    Raises
    ------
    ValueError
        If ``x`` is empty or ``x`` and ``y`` differ in shape.
    """
    x_arr = np.asarray(x, dtype=float)
    y_arr = np.asarray(y, dtype=float)
    if x_arr.size == 0:
        raise ValueError("calculate_variance requires at least one data point")
    if x_arr.shape != y_arr.shape:
        raise ValueError("x and y must have the same shape")
    # np.polyval evaluates a polynomial of any degree, so there is no special
    # casing of linear versus quadratic coefficients.
    residuals = y_arr - np.polyval(coeff, x_arr)
    return float(np.mean(residuals ** 2))
# Input: the x and y arrays for the data points, coefficients of line found using LR
# The helper defined in this file is named calculate_variance; calling
# calculate_variance_linear would raise NameError because no such function exists.
var_dutch = calculate_variance(years, dutch_men, dutch_line_coeff)
var_us = calculate_variance(years, us_men, us_line_coeff)
# Output: variance
print(f"Dutch Variance = {var_dutch}")
print(f"US Variance {var_us}")
# Quadratic regression fit for Dutch and U.S.; plot and print out the parabolas
# Least-squares parabola fit (degree 2) for each series. np.polyfit returns the
# coefficients highest power first: [a, b, c] for a*x^2 + b*x + c, so all three
# terms are printed.
dutch_quad_coeff = np.polyfit(years, dutch_men, 2)
print(
    f"equation of the parabola for Dutch Men is = "
    f"{dutch_quad_coeff[0]}x^2 + {dutch_quad_coeff[1]}x + {dutch_quad_coeff[2]}"
)
q_f = np.poly1d(dutch_quad_coeff)
us_quad_coeff = np.polyfit(years, us_men, 2)
print(
    f"equation of the parabola for US Men is = "
    f"{us_quad_coeff[0]}x^2 + {us_quad_coeff[1]}x + {us_quad_coeff[2]}"
)
q_f_1 = np.poly1d(us_quad_coeff)

# years, dutch_men and us_men are already loaded earlier in the file, so the
# data file is not re-read here. A new figure is started so the data points and
# legend entries are not drawn again on top of an earlier plot.
plt.figure()
plt.plot(years, dutch_men, 'bo', label='Dutch Men')
plt.plot(years, us_men, 'ro', label='US Men')
plt.title("Dutch Men vs US Men Average Height")
plt.xlabel("Time (years)")
plt.ylabel("Average Height (cm)")
# Evaluate the parabolas on a dense, sorted grid so the curves are drawn
# smoothly rather than as straight segments between the data years.
fit_years = np.linspace(years.min(), years.max(), 200)
plt.plot(fit_years, q_f(fit_years), 'b')
plt.plot(fit_years, q_f_1(fit_years), 'r')
plt.legend()
# Calculate variance for the quadratic fits
def calculate_variance(x, y, coeff):
    """Return the mean squared residual between data points and a polynomial fit.

    This redefines the function of the same name from earlier in the file so
    that this section can be run on its own.

    Parameters
    ----------
    x, y : sequences of numbers with the same length
        Coordinates of the data points.
    coeff : sequence of numbers
        Polynomial coefficients, highest power first (the order produced by
        ``np.polyfit``). Any degree is supported.

    Returns
    -------
    float
        Sum of squared vertical distances between each ``y[i]`` and the
        polynomial evaluated at ``x[i]``, divided by the number of points.

    Raises
    ------
    ValueError
        If ``x`` is empty or ``x`` and ``y`` differ in shape.
    """
    x_arr = np.asarray(x, dtype=float)
    y_arr = np.asarray(y, dtype=float)
    if x_arr.size == 0:
        raise ValueError("calculate_variance requires at least one data point")
    if x_arr.shape != y_arr.shape:
        raise ValueError("x and y must have the same shape")
    # np.polyval evaluates a polynomial of any degree, so there is no special
    # casing of linear versus quadratic coefficients.
    residuals = y_arr - np.polyval(coeff, x_arr)
    return float(np.mean(residuals ** 2))
# Mean squared residual of each quadratic fit against its own data series.
quad_var_dutch = calculate_variance(
    x=years,
    y=dutch_men,
    coeff=dutch_quad_coeff,
)
quad_var_us = calculate_variance(
    x=years,
    y=us_men,
    coeff=us_quad_coeff,
)

# Report both values.
print(f'Dutch Men Variance = {quad_var_dutch}')
print(f'US Men Variance = {quad_var_us}')
# Use best fit to predict average heights in 1955 and 1995 for both Dutch and U.S.; compute percent error;
# round values to 2 decimal places
#
# Actual average heights for men (cm) in the two prediction years.
dutch_man_1955 = 180.23
us_man_1955 = 177.22
dutch_man_1995 = 182.54
us_man_1995 = 177.16
def perc_error(actual, predicated, ndigits=2):
    """Return the signed percent error of a prediction relative to the actual value.

    Computed as ``(actual - predicated) / actual * 100``, so the result is
    positive when the prediction is below the actual value and negative when
    it is above.

    Parameters
    ----------
    actual : number
        The reference value; must be non-zero because it is the divisor.
    predicated : number
        The predicted value.
    ndigits : int, optional
        Number of decimal places to round to. Defaults to 2.

    Returns
    -------
    number
        The percent error rounded to ``ndigits`` decimal places.
    """
    error = (actual - predicated) / actual * 100
    return round(error, ndigits)
# Years for which the fitted models are asked to predict the average height.
x = 1955
y = 1995

# Predictions of the linear (f, f_1) and quadratic (q_f, q_f_1) models.
x_pred_lin_dutch = f(x)
x_pred_quad_dutch = q_f(x)
x_pred_lin_us = f_1(x)
x_pred_quad_us = q_f_1(x)
y_pred_lin_dutch = f(y)
y_pred_quad_dutch = q_f(y)
y_pred_lin_us = f_1(y)
y_pred_quad_us = q_f_1(y)

print("YEAR PERSON ACTUAL AVRG LIN_PRED %ERROR for LIN QUAD_PRED %ERROR for Quad")
# One row per (year, population). Actual values come from the named constants
# rather than repeated literals.
for year, person, actual, lin_pred, quad_pred in (
    (x, "DUTCH", dutch_man_1955, x_pred_lin_dutch, x_pred_quad_dutch),
    (x, "US", us_man_1955, x_pred_lin_us, x_pred_quad_us),
    (y, "DUTCH", dutch_man_1995, y_pred_lin_dutch, y_pred_quad_dutch),
    (y, "US", us_man_1995, y_pred_lin_us, y_pred_quad_us),
):
    # Report the magnitude of the percent error for every cell, instead of
    # flipping the sign by hand on only some of them.
    lin_err = abs(perc_error(actual, lin_pred))
    quad_err = abs(perc_error(actual, quad_pred))
    # Predictions and errors are shown with 2 decimal places.
    print(f"{year} {person} {actual} {lin_pred:.2f} {lin_err:.2f} {quad_pred:.2f} {quad_err:.2f}")
# The best fit is the parabola (quadratic model) because its predictions have a lower percent error than those of the linear regression model.