Project 4 - Regression

# Import libraries
import matplotlib.pyplot as plt
import numpy as np

# Input exact values for comparison
# One assignment per line so each reference value is actually defined
# (and easy to diff) instead of being chained with ';'.

# Men
dutch_man_1925 = 174.83
us_man_1925 = 174.53
dutch_man_1955 = 180.23
us_man_1955 = 177.22
dutch_man_1995 = 182.54
us_man_1995 = 177.16

# Women
dutch_woman_1925 = 162.2
us_woman_1925 = 160.97
dutch_woman_1955 = 167.11
us_woman_1955 = 163.54
dutch_woman_1995 = 168.73
us_woman_1995 = 163.56

# Read in the data and plot; add axes labels, plot title and legend
# Note: to add legend use label option in plt.plot (e.g., label='Dutch Men') and then use
#       the command plt.legend ()
filename = 'human_heights.txt'

# Parse the file once and split it into its five columns, rather than
# re-reading the whole file for every column. skiprows=2 skips the first
# two lines of the file; unpack=True returns one array per requested column.
years, dutch_men, dutch_women, us_men, us_women = np.loadtxt(
    filename, skiprows=2, usecols=(0, 1, 2, 3, 4), unpack=True
)

# Women plot
plt.plot(years, dutch_women, 'mo', label='Dutch Women')
plt.plot(years, us_women, 'co', label='US Women')
plt.xlabel('years')
plt.ylabel('height in cm')
plt.title('Height of Dutch Women vs Height of US Women')
plt.legend()
# Finish this figure here so later plt.plot calls start on a fresh one.
plt.show()

# Linear regression fit for both Dutch and U.S.; plot and print out the line
def plot_linear_fit(x, y, group):
    """Fit a straight line to (x, y), print its equation and plot it over the data.

    x, y  -- data arrays handed to np.polyfit
    group -- name used in the printed equation and in the plot title
             (e.g. 'US Women')

    Returns the coefficient array from np.polyfit(x, y, 1); the code below
    uses element 0 as the slope and element 1 as the intercept.
    """
    coeff = np.polyfit(x, y, 1)
    print(coeff)
    print(f'the equation of the line that fits for {group} using linear regression is '
          f'{round(coeff[0],3)}x + {round(coeff[1],3)}')

    plt.plot(x, y, 'co')
    plt.xlabel('years')
    plt.ylabel('height in cm')
    # Draw the fitted line on an evenly spaced grid spanning the data range.
    grid = np.linspace(np.min(x), np.max(x))
    plt.plot(grid, np.polyval(coeff, grid), 'm')
    plt.title(f'Linear Regression fit to Data of {group}')
    plt.show()
    return coeff


# US women
coeff1 = plot_linear_fit(years, us_women, 'US Women')

# Dutch women
coeff2 = plot_linear_fit(years, dutch_women, 'Dutch Women')

[ 0.06112121 43.28224242]
the equation of the line that fits for US Women using linear regression is 0.061x + 43.282

[   0.14876364 -124.89727273]
the equation of the line that fits for Dutch Women using linear regression is 0.149x + -124.897

# Calculate the variance for each fit; use the function that we wrote in a previous notebook
def calculate_variance(x, y, coeff):
    """Return the mean squared distance between the data and a polynomial fit.

    Input:  x, y  -- arrays of the data points (must have the same length)
            coeff -- polynomial coefficients, highest power first, as found
                     by the regression (any degree, not only 1 or 2)
    Output: variance = sum((y_i - p(x_i))**2) / n

    Raises ValueError if the arrays are empty or differ in shape.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    if x.shape != y.shape:
        raise ValueError('x and y must have the same shape')
    if x.size == 0:
        # Dividing by n would otherwise fail with an unhelpful ZeroDivisionError.
        raise ValueError('at least one data point is required')

    # np.polyval evaluates the polynomial for every x at once, whatever its degree.
    residuals = y - np.polyval(coeff, x)
    return float(np.mean(residuals ** 2))


# Variance for US women
var = calculate_variance(years, us_women, coeff1)
print(' the variance for us women is', var)

# Variance for Dutch women
var = calculate_variance(years, dutch_women, coeff2)
print(' the variance for dutch women is', var)

 the variance for us women is 0.5499438787878755
 the variance for dutch women is 1.743654909090882

# Quadratic regression fit for Dutch and U.S.; plot and print out the parabolas
def plot_quadratic_fit(x, y, group):
    """Fit a parabola to (x, y), print its equation and plot it over the data.

    x, y  -- data arrays handed to np.polyfit
    group -- population name used in the printed equation and in the plot
             title (e.g. 'US')

    Returns the coefficient array from np.polyfit(x, y, 2); the code below
    uses elements 0, 1, 2 as the x^2, x and constant coefficients.
    """
    coeff = np.polyfit(x, y, 2)
    print(f' the equation of the parabola found using quadratic regression for {group} women is '
          f'{coeff[0]}x^2 +{coeff[1]}x + {coeff[2]}')

    plt.plot(x, y, 'co')
    plt.xlabel('years')
    plt.ylabel('height in cm')
    # Draw the fitted parabola on an evenly spaced grid spanning the data range.
    grid = np.linspace(np.min(x), np.max(x))
    plt.plot(grid, np.polyval(coeff, grid), 'm')
    # The title names the quadratic fit that is actually drawn.
    plt.title(f'Quadratic Regression fit to Data of {group} Women')
    plt.show()
    return coeff


# US women
coeff1_quad = plot_quadratic_fit(years, us_women, 'US')

# Dutch women
coeff2_quad = plot_quadratic_fit(years, dutch_women, 'Dutch')

 the equation of the parabola found using quadratic regression for US women is -0.0009496212121225872x^2 +3.7551477272781053x + -3548.3751060658624

 the equation of the parabola found using quadratic regression for Dutch women is -0.0018022727272749338x^2 +7.159604545463183x + -6941.453181826632

# Calculate variance for the quadratic fits
def calculate_variance(x, y, coeff):
    """Return the mean squared distance between the data and a polynomial fit.

    Input:  x, y  -- arrays of the data points (must have the same length)
            coeff -- polynomial coefficients, highest power first, as found
                     by the regression (any degree, not only 1 or 2)
    Output: variance = sum((y_i - p(x_i))**2) / n

    Raises ValueError if the arrays are empty or differ in shape.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    if x.shape != y.shape:
        raise ValueError('x and y must have the same shape')
    if x.size == 0:
        # Dividing by n would otherwise fail with an unhelpful ZeroDivisionError.
        raise ValueError('at least one data point is required')

    # np.polyval evaluates the polynomial for every x at once, whatever its degree.
    residuals = y - np.polyval(coeff, x)
    return float(np.mean(residuals ** 2))


# Variance for US women
var = calculate_variance(years, us_women, coeff1_quad)
print(' the variance for us women is', var)

# Variance for Dutch women
var = calculate_variance(years, dutch_women, coeff2_quad)
print(' the variance for dutch women is', var)

 the variance for us women is 0.07380380303032785
 the variance for dutch women is 0.028612181818186415

# Use best fit to predict average heights in 1955 and 1995 for both Dutch and U.S.; compute percent error;
# round values to 2 decimal places
#
# Quadratic was chosen because visually the curve fit the data better and the
# variances were smaller than with the linear fit.
def percent_error(predicted, actual):
    """Return the absolute error of `predicted` as a percentage of `actual`."""
    return abs(predicted - actual) / actual * 100


# Quadratic coefficients for each population. Evaluating the fitted arrays
# directly avoids hand-copying coefficients and predictions, which is how a
# mistyped prediction previously ended up in the US 1995 percent error.
quadratic_fits = {'US': coeff1_quad, 'Dutch': coeff2_quad}

# Actual average heights to compare the predictions against.
actual_heights = {
    'US': {1955: 163.54, 1995: 163.56},
    'Dutch': {1955: 167.11, 1995: 168.73},
}

predicted_heights = {}
for group, coeffs in quadratic_fits.items():
    for year in (1955, 1995):
        predicted_heights[group, year] = round(float(np.polyval(coeffs, year)), 2)
        print(f' predicted {group} Womens height in {year} is {predicted_heights[group, year]}')

print(f" actual heights were dutch_woman_1955 = {actual_heights['Dutch'][1955]};"
      f" us_woman_1955 = {actual_heights['US'][1955]};"
      f" dutch_woman_1995 = {actual_heights['Dutch'][1995]};"
      f" us_woman_1995 = {actual_heights['US'][1995]}")

# Percent errors use the rounded predictions printed above, so the reported
# error always corresponds to the reported prediction.
for group, by_year in actual_heights.items():
    for year, actual in by_year.items():
        error = percent_error(predicted_heights[group, year], actual)
        print(f' {group} womens {year} percent error is {round(error, 2)}')

 predicted US Womens height in 1955 is 163.46
 predicted US Womens height in 1995 is 163.63
 predicted Dutch Womens height in 1955 is 167.24
 predicted Dutch Womens height in 1995 is 168.87
 actual heights were dutch_woman_1955 = 167.11; us_woman_1955 = 163.54; dutch_woman_1995 = 168.73; us_woman_1995 = 163.56
 US womens 1955 percent error is 0.05
 US womens 1995 percent error is 0.06
 Dutch womens 1955 percent error is 0.08
 Dutch womens 1995 percent error is 0.08