Project 4 - Regression

# Import libraries
import matplotlib.pyplot as plt
import numpy as np

# Input exact values for comparison
#
# Exact heights of men for the three reference years; the later sections
# compare fitted predictions against these.
dutch_man_1925 = 174.83
us_man_1925 = 174.53
dutch_man_1955 = 180.23
us_man_1955 = 177.22
dutch_man_1995 = 182.54
us_man_1995 = 177.16
#
# Exact heights of women for the same reference years.
dutch_woman_1925 = 162.2
us_woman_1925 = 160.97
dutch_woman_1955 = 167.11
us_woman_1955 = 163.54
dutch_woman_1995 = 168.73
us_woman_1995 = 163.56

# Skip the first two rows of the file, then take column 2 as the Dutch
# series and column 4 as the U.S. series in a single read.
dutchmen, usmen = np.loadtxt(
    'human_heights.txt', skiprows=2, usecols=(2, 4), unpack=True
)
print('Dutch Men Height', dutchmen)
print('US Men Height', usmen)

# Read in the data and plot; add axes labels, plot title and legend
# Note: to add legend use the label option when plotting (e.g., label='Dutch Men')
# and then call legend()
#
# Column 0 holds the x values (years); columns 2 and 4 hold the two height
# series. The first two rows of the file are skipped.
year = np.loadtxt('human_heights.txt', skiprows=2, usecols=0)
dutchmen = np.loadtxt('human_heights.txt', skiprows=2, usecols=2)
usmen = np.loadtxt('human_heights.txt', skiprows=2, usecols=4)

# Draw on the current axes so the figure being built is the same one the
# pyplot state machine would have used.
ax = plt.gca()
ax.plot(year, dutchmen, 'bo', label='Dutch Men')
ax.plot(year, usmen, 'ro', label='US Men')

# Axis labels must be set by a call. Assigning a string to plt.xlabel or
# plt.ylabel replaces the pyplot function, labels nothing, and makes every
# later plt.xlabel(...) call fail.
ax.set_xlabel("Years")
ax.set_ylabel("Height")
ax.set_title("Dutch Men vs US Men Height")
ax.legend()

# Linear regression fit for both Dutch and U.S.; plot and print out the line
#
# Plot the data points once; repeating the same labelled plot calls would
# add duplicate entries to the legend.
ax = plt.gca()
ax.plot(year, dutchmen, 'bo', label='Dutch Men')
ax.plot(year, usmen, 'ro', label='US Men')

# Use the Axes setters: they work even if plt.xlabel / plt.ylabel have been
# overwritten by an assignment earlier in the session.
ax.set_xlabel("Years")
ax.set_ylabel("Height")
ax.set_title("Dutch Men vs US Men Height")

# Degree-1 least-squares fits; np.polyfit returns [slope, intercept].
line1 = np.polyfit(year, dutchmen, 1)
f1 = np.poly1d(line1)
line2 = np.polyfit(year, usmen, 1)
f2 = np.poly1d(line2)
print(f"The equation of the line for Dutch men using regression is "
      f"{line1[0]}x + {line1[1]}")
print(f"The equation of the line for U.S. men using regression is "
      f"{line2[0]}x + {line2[1]}")

# Sample points for drawing the fitted lines. The endpoint order does not
# affect how a straight segment is drawn.
xx = np.linspace(2000, 1900, 10)
ax.plot(xx, f1(xx), 'b')
yy = f2(xx)
ax.plot(xx, yy, 'r')

ax.legend()
plt.show()

# Input our function for calculating the variance
#
# Input: the x and y arrays for the data points, coefficients of the fitted polynomial
#
# Output: variance
def calculate_variance(x, y, coeff):
    """Return the mean squared vertical distance between data and a fit.

    Args:
        x: Sequence of x values of the data points.
        y: Sequence of y values; y[i] is paired with x[i].
        coeff: Polynomial coefficients, highest power first (the order
            np.polyfit returns). Any degree is accepted.

    Returns:
        The sum of squared differences between y[i] and the polynomial
        evaluated at x[i], divided by len(x).

    Raises:
        ValueError: If x contains no data points.
    """
    n = len(x)
    if n == 0:
        raise ValueError("calculate_variance needs at least one data point")

    var = 0.0
    for i in range(n):
        # Horner's rule evaluates the polynomial for any number of
        # coefficients, not just the linear and quadratic cases.
        y_line = 0.0
        for c in coeff:
            y_line = y_line * x[i] + c
        distance = y[i] - y_line
        var += distance * distance
    return var / float(n)


#
# The fits were computed against year, so year is the x array and each
# height series is the y array for its own line.
var = calculate_variance(year, dutchmen, line1)
print(' The variance of the linear fit to data is', var)
print(' The standard deviation of the linear fit to data is', np.sqrt(var))

var = calculate_variance(year, usmen, line2)
print(' The variance of the linear fit to data is', var)
print(' The standard deviation of the linear fit to data is', np.sqrt(var))

# Quadratic regression fit for Dutch and U.S.; plot and print out the parabolas
#
# Degree-2 least-squares fits; np.polyfit returns the x^2 coefficient first.
parabola_coeff1 = np.polyfit(year, dutchmen, 2)
print(f"The equation of the parabola using regression is {parabola_coeff1[0]}x^2 "
      f"+{parabola_coeff1[1]}x + {parabola_coeff1[2]}")
parabola_coeff2 = np.polyfit(year, usmen, 2)
print(f"The equation of the parabola using regression is {parabola_coeff2[0]}x^2 "
      f"+{parabola_coeff2[1]}x + {parabola_coeff2[2]}")

ax = plt.gca()
ax.plot(year, dutchmen, 'bo', label='Dutch Men')
ax.plot(year, usmen, 'ro', label='US Men')

# Use the Axes setters: assigning strings to plt.xlabel / plt.ylabel sets no
# label and replaces the pyplot functions.
ax.set_xlabel("Years")
ax.set_ylabel("Height")
ax.set_title("Dutch Men vs US Men Height")
ax.legend()

#
# Create data to plot parabola
# The parabolas get their own names so that f1 / f2 are not rebound here
# and keep whatever fit they referred to before this section.
xx1 = np.linspace(2000, 1900, 10)
parabola1 = np.poly1d(parabola_coeff1)
yy1 = parabola1(xx1)
ax.plot(xx1, yy1, 'b')

xx2 = np.linspace(2000, 1900, 10)
parabola2 = np.poly1d(parabola_coeff2)
yy2 = parabola2(xx2)
ax.plot(xx2, yy2, 'r')

plt.show()

# Calculate variance for the quadratic fits
#
# np.polyfit returns coefficients highest power first, so index 0 is the
# x^2 coefficient, index 1 the x coefficient and index 2 the constant term.
coeffs_quad1 = np.polyfit(year, dutchmen, 2)
print(f"Quadratic polynomial is {coeffs_quad1[0]}*x^2 +{coeffs_quad1[1]}*x +"
      f"{coeffs_quad1[2]}")
#
g1 = np.poly1d(coeffs_quad1)
var_quad1 = calculate_variance(year, dutchmen, coeffs_quad1)
print("The variance for a quadratic fit to data is", var_quad1)

coeffs_quad2 = np.polyfit(year, usmen, 2)
print(f"Quadratic polynomial is {coeffs_quad2[0]}*x^2 +{coeffs_quad2[1]}*x +"
      f"{coeffs_quad2[2]}")
#
g2 = np.poly1d(coeffs_quad2)
var_quad2 = calculate_variance(year, usmen, coeffs_quad2)
print("The variance for a quadratic fit to data is", var_quad2)

# Use best fit to predict average heights in 1955 and 1995 for both Dutch and U.S.; compute percent error;
# round values to 2 decimal places
#
def percent_error(predicted, exact):
    """Return the size of (predicted - exact) as a percentage of exact."""
    return abs(predicted - exact) / abs(exact) * 100.0


# Exact values to compare against, keyed by (country, year).
exact_heights = {
    ('Dutch', 1955): dutch_man_1955,
    ('Dutch', 1995): dutch_man_1995,
    ('US', 1955): us_man_1955,
    ('US', 1995): us_man_1995,
}

# Coefficient arrays of each fit, keyed by country and then by fit type.
fit_coefficients = {
    'Dutch': {'linear': line1, 'quadratic': coeffs_quad1},
    'US': {'linear': line2, 'quadratic': coeffs_quad2},
}

# Predictions are evaluated straight from the fitted coefficients, so they
# do not depend on which polynomial the names f1 / f2 currently refer to.
# Both fits are reported so the one with the smaller error can be read off.
for (country, target_year), exact in exact_heights.items():
    for fit_name, coeffs in fit_coefficients[country].items():
        predicted = np.polyval(coeffs, target_year)
        error = percent_error(predicted, exact)
        print(f"{country} men {target_year} ({fit_name} fit): "
              f"predicted {predicted:.2f}, exact {exact:.2f}, "
              f"percent error {error:.2f}%")