# Project 4 - Regression

# Import libraries
import numpy as np
import matplotlib.pyplot as plt

# Input exact values for comparison
#
# These are the reference values that the regression predictions are checked
# against when the percent error is computed.
dutch_man_1925 = 174.83
us_man_1925 = 174.53
dutch_man_1955 = 180.23
us_man_1955 = 177.22
dutch_man_1995 = 182.54
us_man_1995 = 177.16
#
# Values for women, left commented out (not used by this project):
# dutch_woman_1925 = 162.2; us_woman_1925 = 160.97
# dutch_woman_1955 = 167.11; us_woman_1955 = 163.54
# dutch_woman_1995 = 168.73; us_woman_1995 = 163.56

# Read in the data and plot; add axes labels, plot title and legend
# Note: to add legend use label option in plt.plot (e.g., label='Dutch Men') and then use
#       the command plt.legend ()
#
# The file is parsed once instead of once per column. The first two lines are
# skipped; column 0 is read as the years, column 1 as the Dutch men and
# column 3 as the U.S. men. unpack=True hands back one array per requested
# column, in the order given by usecols.
years, dutchMen, usMen = np.loadtxt(
    'human_heights.txt', usecols=(0, 1, 3), skiprows=2, unpack=True
)

plt.plot(years, dutchMen, label='Dutch Men')
plt.plot(years, usMen, label='US Men')
plt.xlabel('Years')
plt.ylabel('Height in cm')
plt.title('Average Heights of US and Dutch Men over the last century')
plt.legend()

# Linear regression fit for both Dutch and U.S.; plot and print out the line

# first for dutch men
dutchCoeff = np.polyfit(years, dutchMen, 1)
f = np.poly1d(dutchCoeff)

# us men
usCoeff = np.polyfit(years, usMen, 1)
g = np.poly1d(usCoeff)

# Evenly spaced x values spanning the data; the fitted lines are evaluated
# and drawn at these points.
yearFit = np.linspace(min(years), max(years), len(years))

# print out the lines (np.polyfit orders coefficients highest power first)
print(f'Dutch men linear fit: height = {dutchCoeff[0]:.4f}*year {dutchCoeff[1]:+.4f}')
print(f'US men linear fit: height = {usCoeff[0]:.4f}*year {usCoeff[1]:+.4f}')

# plot: raw data as markers, fitted lines as curves.
# The fits are drawn against yearFit, i.e. the same x values they were
# evaluated at. Pairing f(yearFit) with `years` is only correct when the data
# happen to be sorted and evenly spaced.
plt.plot(years, dutchMen, 'ro', label='Dutch Men')
plt.plot(years, usMen, 'bo', label='US Men')
plt.plot(yearFit, f(yearFit), label='Regression Line for Dutch')
plt.plot(yearFit, g(yearFit), label='Regression line for US')
plt.xlabel('Years')
plt.ylabel('Height in cm')
plt.title('Average Heights of US and Dutch Men over the last century')
plt.legend()

# Calculate the variance for each fit; use the function that we wrote in a previous notebook
#
# Input: the x and y arrays for the data points, coefficients of line found using LR
#
# Output: variance
#
def findVariance(x, y, coeff):
    """Return the mean squared distance between the data and a fitted polynomial.

    Parameters
    ----------
    x, y : array-like
        Coordinates of the data points.
    coeff : array-like
        Polynomial coefficients ordered from the highest power down to the
        constant term (the ordering produced by ``np.polyfit``). Any degree
        is accepted.

    Returns
    -------
    float
        The sum of squared residuals ``(y - p(x))**2`` divided by the number
        of points.

    Raises
    ------
    ValueError
        If no data points are supplied.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    if x.size == 0:
        raise ValueError('findVariance needs at least one data point')

    # Residual of every point about the fitted curve. The squares are summed
    # and averaged; the running total must be total + d*d, not (total + d)*d.
    residuals = y - np.polyval(coeff, x)
    return float(np.mean(residuals ** 2))


varDutch = findVariance(years, dutchMen, dutchCoeff)
varUS = findVariance(years, usMen, usCoeff)
print(f'the dutch variance is {varDutch}')
print(f'the us variance is {varUS}')

# Quadratic regression fit for Dutch and U.S.; plot and print out the parabolas

# first for dutch men
dutchCoeffQ = np.polyfit(years, dutchMen, 2)
h = np.poly1d(dutchCoeffQ)

# us men
usCoeffQ = np.polyfit(years, usMen, 2)
k = np.poly1d(usCoeffQ)

# Evenly spaced x values spanning the data; the parabolas are evaluated and
# drawn at these points.
yearFit = np.linspace(min(years), max(years), len(years))

# print out the parabolas (np.polyfit orders coefficients highest power first)
print(
    f'Dutch men quadratic fit: height = {dutchCoeffQ[0]:.6f}*year^2 '
    f'{dutchCoeffQ[1]:+.4f}*year {dutchCoeffQ[2]:+.4f}'
)
print(
    f'US men quadratic fit: height = {usCoeffQ[0]:.6f}*year^2 '
    f'{usCoeffQ[1]:+.4f}*year {usCoeffQ[2]:+.4f}'
)

# plot: raw data as markers, fitted parabolas as curves.
# The fits are drawn against yearFit, i.e. the same x values they were
# evaluated at. Pairing h(yearFit) with `years` is only correct when the data
# happen to be sorted and evenly spaced.
plt.plot(years, dutchMen, 'ro', label='Dutch Men')
plt.plot(years, usMen, 'bo', label='US Men')
plt.plot(yearFit, h(yearFit), label='Regression Line for Dutch')
plt.plot(yearFit, k(yearFit), label='Regression line for US')
plt.xlabel('Years')
plt.ylabel('Height in cm')
plt.title('Average Heights of US and Dutch Men over the last century')
plt.legend()

# Variance of the data about each quadratic fit.
# Same helper as before, this time fed the degree-2 coefficients.
varDutch = findVariance(years, dutchMen, dutchCoeffQ)
varUS = findVariance(years, usMen, usCoeffQ)

# Report both results, one per line.
print(
    f'the dutch variance is {varDutch}',
    f'the us variance is {varUS}',
    sep='\n',
)

# Predict the average heights in 1955 and 1995 for both Dutch and U.S. men
# with the fitted models h (Dutch) and k (U.S.).
# Each prediction is rounded to 2 decimal places before printing.
#
print(
    f'the predicted Dutch height in 1955 is {round(h(1955), 2)} '
    f'and in 1995 is {round(h(1995), 2)}'
)
print(
    f'the predicted us height in 1955 is {round(k(1955), 2)} '
    f'and in 1995 is {round(k(1995), 2)}.'
)

def per_error(approx, exact):
    """Return the percent error of approx relative to exact.

    The signed relative difference is scaled to a percentage, made
    non-negative, and rounded to 2 decimal places.
    """
    percent = ((approx - exact) / exact) * 100
    # Flip the sign only when strictly negative, so the result is a magnitude.
    magnitude = -percent if percent < 0 else percent
    return round(magnitude, 2)


# Percent error of the Dutch predictions against the exact values.
derror1 = per_error(h(1955), dutch_man_1955)
derror2 = per_error(h(1995), dutch_man_1995)
print(f'dutch 1955 percent error:{derror1} 1995 error: {derror2}')

# Percent error of the U.S. predictions against the exact values.
uerror1 = per_error(k(1955), us_man_1955)
uerror2 = per_error(k(1995), us_man_1995)
print(f'us 1955 percent error:{uerror1} 1995 error: {uerror2}')