# Import libraries
import numpy as np
import matplotlib.pyplot as plt
# Exact average heights used as reference values for comparison.
# Each line assigns one year as a (Dutch, U.S.) pair.
#
# Men
dutch_man_1925, us_man_1925 = 174.83, 174.53
dutch_man_1955, us_man_1955 = 180.23, 177.22
dutch_man_1995, us_man_1995 = 182.54, 177.16
#
# Women
dutch_woman_1925, us_woman_1925 = 162.2, 160.97
dutch_woman_1955, us_woman_1955 = 167.11, 163.54
dutch_woman_1995, us_woman_1995 = 168.73, 163.56
# Read the data and draw a scatter plot with axis labels, a title and a legend.
#
# The file is parsed once: ``usecols`` selects columns 0, 1 and 3 (year,
# Dutch men, U.S. men) and ``unpack=True`` returns one array per selected
# column. The first two lines of the file are skipped (presumably header
# lines -- confirm against the data file).
year, dutchmen, usmen = np.loadtxt(
    'human_heights.txt', usecols=(0, 1, 3), skiprows=2, unpack=True
)

# The ``label`` given to each plt.plot call is the text plt.legend() displays.
plt.plot(year, dutchmen, 'bo', label='Dutch Men')
plt.plot(year, usmen, 'ro', label='American Men')
plt.xlabel("Year")
plt.ylabel("Heights in cm")
plt.title("Yearly Dutch and American Male Heights")
plt.legend()
plt.show()
# Linear regression fit for both Dutch and U.S. men; print each fitted line
# and plot it on top of the data points.
#
# np.polyfit returns coefficients highest power first, so for a degree-1 fit
# index 0 is the slope and index 1 is the intercept.
Dutchline_coeff = np.polyfit(year, dutchmen, 1)
USline_coeff = np.polyfit(year, usmen, 1)
f = np.poly1d(Dutchline_coeff)
g = np.poly1d(USline_coeff)

# Years at which the fitted lines are evaluated for plotting.
yeareval = [1900, 1925, 1955, 1985]
Dutcheval = f(yeareval)
USeval = g(yeareval)

# The "+" format flag always prints the intercept's sign, so the equation is
# well formed whether the intercept is positive or negative.
print(
    "The equation of the Dutch line using linear regression is "
    f"{Dutchline_coeff[0]}x {Dutchline_coeff[1]:+}"
)
print(
    "The equation of the US line using linear regression is "
    f"{USline_coeff[0]}x {USline_coeff[1]:+}"
)

# Data as markers (labelled for the legend), fitted lines in matching colours.
plt.plot(year, dutchmen, 'bo', label='Dutch Men')
plt.plot(yeareval, Dutcheval, '-b')
plt.plot(year, usmen, 'ro', label='American Men')
plt.plot(yeareval, USeval, 'r-')
plt.xlabel("Year")
plt.ylabel("Heights in cm")
plt.title("Yearly Dutch and American Male Heights")
plt.legend()
plt.show()
# Function for calculating the variance of a polynomial fit
def calculate_variance(x, y, coeff):
    """Return the mean squared residual of a polynomial fit.

    Parameters
    ----------
    x, y : array-like of numbers
        Coordinates of the data points; must have the same, non-zero length.
    coeff : array-like of numbers
        Polynomial coefficients, highest power first (the order returned by
        ``np.polyfit``). Any degree is accepted, not only 1 and 2.

    Returns
    -------
    float
        Sum of the squared vertical distances between each data point and
        the polynomial, divided by the number of points.

    Raises
    ------
    ValueError
        If ``x`` is empty or ``x`` and ``y`` do not have the same shape.
    """
    x_values = np.asarray(x, dtype=float)
    y_values = np.asarray(y, dtype=float)

    # Fail with a clear message instead of a ZeroDivisionError / IndexError.
    if x_values.size == 0:
        raise ValueError("calculate_variance needs at least one data point")
    if x_values.shape != y_values.shape:
        raise ValueError("x and y must have the same length")

    # np.polyval evaluates the polynomial for every x at once, for any degree.
    residuals = y_values - np.polyval(coeff, x_values)
    return float(np.mean(residuals ** 2))
# Variance of each linear fit: calculate_variance is given the data points
# and the degree-1 coefficients found by the linear regression.
Dutchvarlinear = calculate_variance(year, dutchmen, Dutchline_coeff)
USvarlinear = calculate_variance(year, usmen, USline_coeff)

# Report both results, Dutch first and then U.S.
for _message, _variance in (
    ("The variance for a linear fit to the Dutch men data is", Dutchvarlinear),
    ("The variance for a linear fit to the US men data is", USvarlinear),
):
    print(_message, _variance)
# Quadratic regression fit for Dutch and U.S. men; print each fitted parabola
# and plot it on top of the data points.
#
# np.polyfit returns coefficients highest power first, so for a degree-2 fit
# index 0 multiplies x^2, index 1 multiplies x and index 2 is the constant.
Dutchquad_coeff = np.polyfit(year, dutchmen, 2)
USquad_coeff = np.polyfit(year, usmen, 2)
y = np.poly1d(Dutchquad_coeff)
z = np.poly1d(USquad_coeff)

# Values of the parabolas at the evaluation years, under their original names.
Dutchquadeval = y(yeareval)
USquadeval = z(yeareval)

# The "+" format flag prints each following coefficient with its sign, so the
# equation is well formed for negative coefficients too.
print(
    "The equation for Dutch men using quadratic regression is "
    f"{Dutchquad_coeff[0]}*x^2 {Dutchquad_coeff[1]:+}*x {Dutchquad_coeff[2]:+}"
)
print(
    "The equation for US men using quadratic regression is "
    f"{USquad_coeff[0]}*x^2 {USquad_coeff[1]:+}*x {USquad_coeff[2]:+}"
)

# A parabola joined through only a handful of points is drawn as straight
# segments, so plot the curves on a dense grid over the same year range.
yearquadplot = np.linspace(min(yeareval), max(yeareval), 200)

plt.plot(year, dutchmen, 'bo', label='Dutch Men')
plt.plot(yearquadplot, y(yearquadplot), '-b')
plt.plot(year, usmen, 'ro', label='American Men')
plt.plot(yearquadplot, z(yearquadplot), 'r-')
plt.xlabel("Year")
plt.ylabel("Heights in cm")
plt.title("Yearly Dutch and American Male Heights")
plt.legend()
plt.show()
# Variance of each quadratic fit: calculate_variance is given the data points
# and the degree-2 coefficients found by the quadratic regression.
Dutchvarquad = calculate_variance(year, dutchmen, Dutchquad_coeff)
USvarquad = calculate_variance(year, usmen, USquad_coeff)

# Report both results, Dutch first and then U.S.
for _message, _variance in (
    ("The variance for a quadratic fit to the Dutch men data is", Dutchvarquad),
    ("The variance for a quadratic fit to the US men data is", USvarquad),
):
    print(_message, _variance)
# Predict the average heights in 1955 and 1995 for both Dutch and U.S. men and
# compare them with the exact values; print everything to 2 decimal places.
#
# y and z are the quadratic np.poly1d fits for Dutch and U.S. men defined
# earlier in the script, so these are the quadratic-model predictions.
Dutchpredictedquad1955 = y(1955)
Dutchpredictedquad1995 = y(1995)
USpredictedquad1955 = z(1955)
USpredictedquad1995 = z(1995)

# Relative error |predicted - exact| / exact, stored as a fraction. Each
# country/year pair gets its own variable so no result overwrites another.
Dutch1955se = abs(Dutchpredictedquad1955 - dutch_man_1955) / dutch_man_1955
Dutch1995se = abs(Dutchpredictedquad1995 - dutch_man_1995) / dutch_man_1995
us1955se = abs(USpredictedquad1955 - us_man_1955) / us_man_1955
us1995se = abs(USpredictedquad1995 - us_man_1995) / us_man_1995

print(
    f"The predicted heights for Dutch men in 1955 is {Dutchpredictedquad1955:.2f} "
    f"and in 1995 is {Dutchpredictedquad1995:.2f}"
)
print(
    f"The predicted heights for American men in 1955 is {USpredictedquad1955:.2f} "
    f"and in 1995 is {USpredictedquad1995:.2f}"
)

# The ".2%" format multiplies the fraction by 100 and appends a percent sign.
print(
    f"The percent error for 1955 is {Dutch1955se:.2%} for Dutch men "
    f"and {us1955se:.2%} for American men"
)
print(
    f"The percent error for 1995 is {Dutch1995se:.2%} for Dutch men "
    f"and {us1995se:.2%} for American men"
)