# Project 4 - Regression

# Import libraries
import matplotlib.pyplot as plt
import numpy as np

# Input exact values for comparison.
# These are the reference average heights that the regression predictions
# are later compared against (presumably in cm, matching the 'height in cm'
# axis label used for the plotted data -- confirm against the data source).

# Men
dutch_man_1925 = 174.83
us_man_1925 = 174.53
dutch_man_1955 = 180.23
us_man_1955 = 177.22
dutch_man_1995 = 182.54
us_man_1995 = 177.16

# Women
dutch_woman_1925 = 162.2
us_woman_1925 = 160.97
dutch_woman_1955 = 167.11
us_woman_1955 = 163.54
dutch_woman_1995 = 168.73
us_woman_1995 = 163.56

# Read in the data and plot; add axes labels, plot title and legend.
# Note: to add a legend, pass the label option to plt.plot
# (e.g., label='Dutch Men') and then call plt.legend().
filename = 'human_heights.txt'

# Parse the file once instead of once per column: skip the first two lines
# and let unpack=True hand back columns 0-4 as five separate arrays.
years, dutch_men, dutch_women, us_men, us_women = np.loadtxt(
    filename, skiprows=2, usecols=(0, 1, 2, 3, 4), unpack=True
)

# Scatter plot comparing the two women's series.
plt.plot(years, dutch_women, 'mo', label='Dutch Women')
plt.plot(years, us_women, 'co', label='US Women')
plt.xlabel('years')
plt.ylabel('height in cm')
plt.title('Height of Dutch Women vs Height of US Women')
plt.legend()
# Finish this figure here so later plots start on a clean canvas when the
# file is run as a plain script (a notebook does this implicitly per cell).
plt.show()

# Linear regression fit for both Dutch and U.S.; plot and print out the line.


def _report_linear_fit(group, x_data, y_data):
    """Fit a straight line to the data, print it and plot it over the points.

    Parameters
    ----------
    group : str
        Name inserted into the printed message and the plot title
        (e.g. 'US' or 'Dutch').
    x_data, y_data : array-like
        Data points to fit; x values and matching y values.

    Returns
    -------
    numpy.ndarray
        The np.polyfit coefficients, highest power first: [slope, intercept].
    """
    coeff = np.polyfit(x_data, y_data, 1)
    print(coeff)
    print(
        f'the equation of the line that fits for {group} Women using linear '
        f'regression is {round(coeff[0],3)}x + {round(coeff[1],3)}'
    )

    # Data points.
    plt.plot(x_data, y_data, 'co')
    plt.xlabel('years')
    plt.ylabel('height in cm')

    # Fitted line, evaluated on an evenly spaced grid spanning the data range.
    grid = np.linspace(np.min(x_data), np.max(x_data))
    plt.plot(grid, np.poly1d(coeff)(grid), 'm')
    plt.title(f'Linear Regression fit to Data of {group} Women')
    plt.show()
    return coeff


# coeff1 / coeff2 are kept as module-level names because the variance
# calculation for the linear fits reads them.
coeff1 = _report_linear_fit('US', years, us_women)
coeff2 = _report_linear_fit('Dutch', years, dutch_women)

# Calculate the variance for each fit; use the function that we wrote in a
# previous notebook.


def calculate_variance(x, y, coeff):
    """Return the variance of the data about a fitted polynomial.

    The variance is the mean of the squared vertical distances between each
    data point and the polynomial: sum((y_i - p(x_i))**2) / n.

    Input:
        x, y  -- the x and y arrays for the data points (same length).
        coeff -- coefficients of the fitted polynomial, highest power first
                 (the order np.polyfit returns). Any degree is accepted, so
                 the same function serves the linear and quadratic fits.

    Output:
        The variance as a float.

    Raises:
        ValueError -- if x is empty or x and y differ in length.
    """
    x_arr = np.asarray(x, dtype=float)
    y_arr = np.asarray(y, dtype=float)
    if x_arr.size == 0:
        raise ValueError('calculate_variance needs at least one data point')
    if x_arr.shape != y_arr.shape:
        raise ValueError('x and y must contain the same number of points')

    # np.polyval evaluates the polynomial for whatever degree coeff has.
    residuals = y_arr - np.polyval(coeff, x_arr)
    # float() so callers get a plain Python number rather than a NumPy scalar.
    return float(np.mean(residuals * residuals))


# variance for us women
var = calculate_variance(years, us_women, coeff1)
print(' the variance for us women is', var)

# variance for dutch women
var = calculate_variance(years, dutch_women, coeff2)
print(' the variance for dutch women is', var)

# Quadratic regression fit for Dutch and U.S.; plot and print out the
# parabolas.


def _report_quadratic_fit(group, x_data, y_data):
    """Fit a parabola to the data, print it and plot it over the points.

    Parameters
    ----------
    group : str
        Name inserted into the printed message and the plot title
        (e.g. 'US' or 'Dutch').
    x_data, y_data : array-like
        Data points to fit; x values and matching y values.

    Returns
    -------
    numpy.ndarray
        The np.polyfit coefficients, highest power first: [a, b, c] for
        a*x^2 + b*x + c.
    """
    coeff = np.polyfit(x_data, y_data, 2)
    # The message is split across two literals rather than with a backslash
    # inside the string, which leaked continuation whitespace into the output.
    print(
        f' the equation of the parabola found using quadratic regression for '
        f'{group} women is {coeff[0]}x^2 +{coeff[1]}x + {coeff[2]}'
    )

    # Data points.
    plt.plot(x_data, y_data, 'co')
    plt.xlabel('years')
    plt.ylabel('height in cm')

    # Fitted parabola, evaluated on an evenly spaced grid over the data range.
    grid = np.linspace(np.min(x_data), np.max(x_data))
    plt.plot(grid, np.poly1d(coeff)(grid), 'm')
    # These are degree-2 fits, so the title says "Quadratic" (it used to
    # repeat the linear-fit title).
    plt.title(f'Quadratic Regression fit to Data of {group} Women')
    plt.show()
    return coeff


# coeff1_quad / coeff2_quad are kept as module-level names because the
# quadratic variance calculation reads them.
coeff1_quad = _report_quadratic_fit('US', years, us_women)
coeff2_quad = _report_quadratic_fit('Dutch', years, dutch_women)

# Calculate variance for the quadratic fits.
# calculate_variance is already defined earlier in this file (with the linear
# variances) and handles degree-2 coefficients, so it is reused here instead
# of being redefined before every call.

# variance for us women
var = calculate_variance(years, us_women, coeff1_quad)
print(' the variance for us women is', var)

# variance for dutch women
var = calculate_variance(years, dutch_women, coeff2_quad)
print(' the variance for dutch women is', var)

# Use best fit to predict average heights in 1955 and 1995 for both Dutch and
# U.S.; compute percent error; round values to 2 decimal places.
#
# Quadratic was chosen because visually the curve fit the data better and the
# variances were smaller than with linear.
#
# The predictions are evaluated from the fitted coefficients and compared with
# the named exact values instead of pasted-in numbers, so they stay correct
# if the data file or the fit changes.


def _percent_error(predicted, actual):
    """Return the absolute error of predicted relative to actual, in percent."""
    return abs(predicted - actual) / actual * 100


# (label, quadratic coefficients, ((year, exact height), ...)) per group.
_prediction_cases = (
    ('US', coeff1_quad, ((1955, us_woman_1955), (1995, us_woman_1995))),
    ('Dutch', coeff2_quad, ((1955, dutch_woman_1955), (1995, dutch_woman_1995))),
)

# Predicted heights, rounded to 2 decimal places, keyed by (label, year).
predictions = {}
for _group, _coeff, _targets in _prediction_cases:
    for _year, _actual in _targets:
        _predicted = round(float(np.polyval(_coeff, _year)), 2)
        predictions[(_group, _year)] = _predicted
        print(f' predicted {_group} Womens height in {_year} is {_predicted}')

# Percent error of each rounded prediction against the exact value.
for _group, _coeff, _targets in _prediction_cases:
    for _year, _actual in _targets:
        _error = _percent_error(predictions[(_group, _year)], _actual)
        print(f' {_group} womens {_year} percent error is {round(_error, 2)}')