# Import libraries
import numpy as np
import matplotlib.pyplot as plt
# Reference values (cm): measured average adult heights used later to
# gauge the accuracy of the regression predictions.
#
dutch_man_1925 = 174.83; us_man_1925 = 174.53
dutch_man_1955 = 180.23; us_man_1955 = 177.22
dutch_man_1995 = 182.54; us_man_1995 = 177.16
#
dutch_woman_1925 = 162.2; us_woman_1925 = 160.97
dutch_woman_1955 = 167.11; us_woman_1955 = 163.54
dutch_woman_1995 = 168.73; us_woman_1995 = 163.56
# Read in the data and plot; add axes labels, plot title and legend
# Note: to add legend use label option in plt.plot (e.g., label='Dutch Men') and then use
# the command plt.legend ()
# NOTE(review): only the men's 1955/1995 values are used below (and they are
# re-assigned before the prediction step); the women's values are unused here.
#
# Read the data once and slice the columns we need — the original re-read the
# same file four times; a single load plus slicing is equivalent and faster.
# Columns: 0 = year, 1 = Dutch men (cm), 3 = US men (cm); first 2 rows are headers.
my_data = np.loadtxt('human_heights.txt', skiprows=2)
years = my_data[:, 0]
dutch_men = my_data[:, 1]
us_men = my_data[:, 3]
print(years)
print(dutch_men)
print(us_men)
# Scatter plot of the raw data with labels, title and legend.
plt.plot(years, dutch_men, 'bo', label='Dutch Men')
plt.plot(years, us_men, 'ro', label='US Men')
plt.title("Dutch Men vs US Men Average Height")
plt.xlabel("Time (years)")
plt.ylabel("Average Height (cm)")
plt.legend()
# Captured cell output:
# [1900. 1910. 1920. 1930. 1940. 1950. 1960. 1970. 1980. 1990.]
# [170.14 171.95 173.95 175.7 177.51 179.26 181.11 182.18 182.55 182.55]
# [171.48 172.52 173.82 175.25 176.4 177.06 177.29 177.4 177.53 177.3 ]
# Linear regression fit for both Dutch and U.S.; plot and print out the line.
# np.polyfit(..., 1) returns [slope, intercept]; np.poly1d wraps the
# coefficients as a callable polynomial used for plotting/prediction below.
dutch_line_coeff = np.polyfit(years, dutch_men, 1)
# Fixed the doubled "is" in the original message ("is for ... is =").
print(f"equation of the line for Dutch Men is {dutch_line_coeff[0]}x + {dutch_line_coeff[1]}")
f = np.poly1d(dutch_line_coeff)
us_line_coeff = np.polyfit(years, us_men, 1)
print(f"equation of the line for US Men is {us_line_coeff[0]}x + {us_line_coeff[1]}")
f_1 = np.poly1d(us_line_coeff)
# Re-plot the raw data together with the fitted regression lines.
# Removed the four redundant np.loadtxt calls — my_data, years, dutch_men and
# us_men are already in memory from the load above.
plt.plot(years, dutch_men, 'bo', label='Dutch Men')
plt.plot(years, us_men, 'ro', label='US Men')
plt.title("Dutch Men vs US Men Average Height")
plt.xlabel("Time (years)")
plt.ylabel("Average Height (cm)")
# Overlay the two least-squares lines (blue = Dutch, red = US).
plt.plot(years, f(years), 'b')
plt.plot(years, f_1(years), 'r')
plt.legend()
# Captured cell output:
# equation of the line is for Dutch Men is = 0.14849696969696974x + -111.13660606060607
# equation of the line is for US Men is = 43.42751515151494 intercept with slope 0.06795757575757586
# Calculate the variance for each fit; use the function that we wrote in a previous notebook
def calculate_variance(x, y, coeff):
    """Return the variance of a polynomial fit: the mean squared vertical
    distance between the data points (x[i], y[i]) and the fitted polynomial.

    Parameters
    ----------
    x, y : sequences of equal length — the data points.
    coeff : polynomial coefficients, highest power first (np.polyfit order).
            Works for any degree (the original handled only degrees 1 and 2).
    """
    n = len(x)
    if n == 0:
        # No data points -> no spread (the original raised ZeroDivisionError).
        return 0.0
    var = 0.0
    for i in range(n):
        # Horner's rule evaluates the polynomial at x[i] for any degree,
        # replacing the original hard-coded linear/quadratic branches.
        y_line = 0.0
        for c in coeff:
            y_line = y_line * x[i] + c
        distance = y[i] - y_line  # vertical distance from data point to curve
        var = var + distance * distance
    return var / float(n)
# Input: the x and y arrays for the data points, coefficients of line found using LR
# Bug fix: the original called calculate_variance_linear, which is never
# defined (the function above is named calculate_variance) -> NameError.
var_dutch = calculate_variance(years, dutch_men, dutch_line_coeff)
var_us = calculate_variance(years, us_men, us_line_coeff)
# Output: variance
print(f"Dutch Variance = {var_dutch}")
print(f"US Variance = {var_us}")  # added missing "=" for consistency with the Dutch line
# Captured cell output:
# Dutch Variance = 0.7567162424242403
# US Variance 0.7133635151515108
# Quadratic regression fit for Dutch and U.S.; plot and print out the parabolas.
dutch_quad_coeff = np.polyfit(years, dutch_men, 2)
# A degree-2 fit has three coefficients; the original print dropped the
# constant term and labelled the parabola a "line".
print(f"equation of the parabola for Dutch Men is {dutch_quad_coeff[0]}x^2 + {dutch_quad_coeff[1]}x + {dutch_quad_coeff[2]}")
q_f = np.poly1d(dutch_quad_coeff)
us_quad_coeff = np.polyfit(years, us_men, 2)
print(f"equation of the parabola for US Men is {us_quad_coeff[0]}x^2 + {us_quad_coeff[1]}x + {us_quad_coeff[2]}")
q_f_1 = np.poly1d(us_quad_coeff)
# Re-plot the raw data together with the fitted parabolas.
# Removed the four redundant np.loadtxt calls — my_data, years, dutch_men and
# us_men are already in memory from the load at the top of the script.
plt.plot(years, dutch_men, 'bo', label='Dutch Men')
plt.plot(years, us_men, 'ro', label='US Men')
plt.title("Dutch Men vs US Men Average Height")
plt.xlabel("Time (years)")
plt.ylabel("Average Height (cm)")
# Overlay the two quadratic fits (blue = Dutch, red = US).
plt.plot(years, q_f(years), 'b')
plt.plot(years, q_f_1(years), 'r')
plt.legend()
# Captured cell output:
# equation of the line is for Dutch Men is = -0.0010795454545465886x + 4.3479287878832285
# equation of the line is for US Men is = -0.0011325757575770677x + 4.4736772727324
# Calculate variance for the quadratic fits
def calculate_variance(x, y, coeff):
    """Average squared vertical distance between the data points and the
    fitted polynomial. The fit is linear when coeff has two entries and
    quadratic otherwise (coefficients in np.polyfit order, highest first)."""
    if len(coeff) - 1 == 1:
        # Straight line: slope * x + intercept
        predict = lambda t: coeff[0] * t + coeff[1]
    else:
        # Parabola: a*x^2 + b*x + c
        predict = lambda t: coeff[0] * t * t + coeff[1] * t + coeff[2]
    total = 0.0
    for i in range(len(x)):
        residual = y[i] - predict(x[i])
        total = total + residual * residual
    return total / float(len(x))
# Variance of each quadratic fit against the raw data.
quad_var_dutch = calculate_variance(years, dutch_men, dutch_quad_coeff)
quad_var_us = calculate_variance(years, us_men, us_quad_coeff)
for nation, variance in (("Dutch Men", quad_var_dutch), ("US Men", quad_var_us)):
    print(f'{nation} Variance = {variance}')
# Captured cell output:
# Dutch Men Variance = 0.14137533333314972
# US Men Variance = 0.036083212121159454
# Use best fit to predict average heights in 1955 and 1995 for both Dutch and U.S.; compute percent error;
# round values to 2 decimal places
# NOTE(review): perc_error below actually rounds to 3 decimal places, not 2 — confirm intent.
#
dutch_man_1955 = 180.23; us_man_1955 = 177.22  # measured reference heights (cm)
dutch_man_1995 = 182.54; us_man_1995 = 177.16  # measured reference heights (cm)
def perc_error(actual, predicated):
    """Signed percent error of the prediction relative to the actual value,
    rounded to 3 decimal places (positive when the prediction is too low)."""
    return round((actual - predicated) / actual * 100, 3)
x = 1955  # interpolated year (inside the 1900-1990 data range)
y = 1995  # extrapolated year (outside the data range)
# Evaluate both fits at each year; f/f_1 are the linear polynomials,
# q_f/q_f_1 the quadratics, for Dutch and US men respectively.
x_pred_lin_dutch = f(x)
x_pred_quad_dutch = q_f(x)
x_pred_lin_us = f_1(x)
x_pred_quad_us = q_f_1(x)
y_pred_lin_dutch = f(y)
y_pred_quad_dutch = q_f(y)
y_pred_lin_us = f_1(y)
y_pred_quad_us = q_f_1(y)
print("YEAR PERSON ACTUAL AVRG LIN_PRED %ERROR for LIN QUAD_PRED %ERROR for Quad")
# abs() replaces the original's ad-hoc "* -1" sign flips, which were applied
# inconsistently (only on rows where a negative error happened to appear);
# the printed magnitudes are identical.
print(f"1955 DUTCH 180.23 {x_pred_lin_dutch} {abs(perc_error(180.23, x_pred_lin_dutch))} {x_pred_quad_dutch} {abs(perc_error(180.23, x_pred_quad_dutch))}")
print(f"1955 US 177.22 {x_pred_lin_us} {abs(perc_error(177.22, x_pred_lin_us))} {x_pred_quad_us} {abs(perc_error(177.22, x_pred_quad_us))}")
print(f"1995 DUTCH 182.54 {y_pred_lin_dutch} {abs(perc_error(182.54, y_pred_lin_dutch))} {y_pred_quad_dutch} {abs(perc_error(182.54, y_pred_quad_dutch))}")
print(f"1995 US 177.16 {y_pred_lin_us} {abs(perc_error(177.16, y_pred_lin_us))} {y_pred_quad_us} {abs(perc_error(177.16, y_pred_quad_us))}")
# Captured cell output:
# YEAR PERSON ACTUAL AVRG LIN_PRED %ERROR for LIN QUAD_PRED %ERROR for Quad
# 1955 DUTCH 180.23 179.17496969696975 0.585 179.95764015151508 0.151
# 1955 US 177.22 176.28457575757574 0.528 177.10569318181842 0.064
# 1995 DUTCH 182.54 185.11484848484855 1.411 183.30660984848328 0.42
# 1995 US 177.16 179.00287878787876 1.04 177.10581439393718 0.031
# Conclusion: the best fit is the parabola, because its percent error is lower than that of the linear regression model.