# Import libraries
import numpy as np
import matplotlib.pyplot as plt
# Input exact values for comparison
#
dutch_man_1925 = 174.83; us_man_1925 = 174.53
dutch_man_1955 = 180.23; us_man_1955 = 177.22
dutch_man_1995 = 182.54; us_man_1995 = 177.16
#
dutch_woman_1925 = 162.2; us_woman_1925 = 160.97
dutch_woman_1955 = 167.11; us_woman_1955 = 163.54
dutch_woman_1995 = 168.73; us_woman_1995 = 163.56
# Read in the data and plot; add axes labels, plot title and legend
# Note: to add legend use label option in plt.plot (e.g., label='Dutch Men') and then use
# the command plt.legend ()
#
years = np.loadtxt('human_heights.txt',skiprows=2,usecols=0)
dutch_men = np.loadtxt('human_heights.txt',skiprows=2,usecols=1)
us_men = np.loadtxt('human_heights.txt',skiprows=2,usecols=3)
plt.plot(years,dutch_men,'ro',label='Dutch Men')
plt.plot(years,us_men,'bo',label='U.S. Men')
plt.xlabel('Years')
plt.ylabel('Height in CM')
plt.title('Dutch vs U.S. Men Heights')
plt.legend()
# Linear regression fit for both Dutch and U.S.; plot and print out the line
#np.loadtxt already returns NumPy arrays, so these conversions are redundant but harmless; the *_array names are kept for clarity
year_array = np.array(years)
dutch_men_array = np.array(dutch_men)
us_men_array = np.array(us_men)
#calculate coefficients
dutch_men_coefficient = np.polyfit(year_array,dutch_men_array,1)
us_men_coefficient = np.polyfit(year_array,us_men_array,1)
#print out equations
print(f'The equation for Dutch men is {round(dutch_men_coefficient[0],2)} * x + {round(dutch_men_coefficient[1],2)}')
print(f'The equation for U.S. men is {round(us_men_coefficient[0],2)} * x + {round(us_men_coefficient[1],2)}')
#write our function that evaluates the line at an array of x-values
def eval_line(coeff,x_eval):
    return coeff[0] * x_eval + coeff[1]
#create our lines
dutch_men_line = eval_line(dutch_men_coefficient,year_array)
us_men_line = eval_line(us_men_coefficient,year_array)
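# Optional sanity check (a minimal sketch, not part of the original assignment):
# NumPy's np.polyval evaluates a polynomial from its coefficients (highest degree first,
# matching np.polyfit), so it should agree with our hand-written eval_line.
assert np.allclose(dutch_men_line, np.polyval(dutch_men_coefficient, year_array))
assert np.allclose(us_men_line, np.polyval(us_men_coefficient, year_array))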
#plot our lines
plt.plot(year_array,dutch_men_line,'r',label='Dutch Men Line')
plt.plot(year_array,us_men_line,'b',label='U.S. Men Line')
plt.xlabel('Years')
plt.ylabel('Height in CM')
plt.title('Dutch vs U.S. Men Height Regression')
plt.legend()
# Calculate the variance for each fit; use the function that we wrote in a previous notebook
#
# Input: the x and y arrays for the data points, coefficients of line found using LR
#
# Output: variance
#
def calculate_linear_variance(x,y,coeff):
    var = 0.0
    n = len(x)
    for i in range(0,n):
        y_line = coeff[0] * x[i] + coeff[1]
        y_data = y[i]
        distance = y_data - y_line
        var = var + distance * distance
    var = var / float(n)
    return var
#call our function
dutch_men_linear_variance = calculate_linear_variance(year_array,dutch_men_array,dutch_men_coefficient)
us_men_linear_variance = calculate_linear_variance(year_array,us_men_array,us_men_coefficient)
print(f'The variance for Dutch men is {dutch_men_linear_variance}')
print(f'The variance for U.S. men is {us_men_linear_variance}')
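# Optional cross-check (a sketch using NumPy directly, not part of the original notebook):
# this variance is just the mean squared residual, so it can also be computed in one line.
assert np.isclose(dutch_men_linear_variance,
                  np.mean((dutch_men_array - np.polyval(dutch_men_coefficient, year_array))**2))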
# Quadratic regression fit for Dutch and U.S.; plot and print out the parabolas
#calculate coefficients for quadratic
dutch_men_coefficient_quadratic = np.polyfit(year_array,dutch_men_array,2)
us_men_coefficient_quadratic = np.polyfit(year_array,us_men_array,2)
#print out equations and use np.poly1d on our fits
#(the x^2 coefficient is small, so it is shown with more decimal places)
print(f'The equation for Dutch men is {round(dutch_men_coefficient_quadratic[0],5)} * x^2 + {round(dutch_men_coefficient_quadratic[1],2)} * x + {round(dutch_men_coefficient_quadratic[2],2)}')
print(f'The equation for U.S. men is {round(us_men_coefficient_quadratic[0],5)} * x^2 + {round(us_men_coefficient_quadratic[1],2)} * x + {round(us_men_coefficient_quadratic[2],2)}')
one_d_dutch_men = np.poly1d(dutch_men_coefficient_quadratic)
one_d_us_men = np.poly1d(us_men_coefficient_quadratic)
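# Optional: np.poly1d objects have a readable string form, so the fitted quadratics can
# also be displayed directly (just a convenience, not required by the assignment).
print('Dutch men quadratic fit:')
print(one_d_dutch_men)
print('U.S. men quadratic fit:')
print(one_d_us_men)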
#create our parabolas
dutch_men_parabola = one_d_dutch_men(year_array)
us_men_parabola = one_d_us_men(year_array)
plt.plot(year_array,dutch_men_parabola,'r',label='Dutch Men Parabola')
plt.plot(year_array,us_men_parabola,'b',label='U.S. Men Parabola')
plt.xlabel('Years')
plt.ylabel('Height in CM')
plt.title('Dutch Men vs U.S. Men Quadratic Regression')
plt.legend()
# Calculate variance for the quadratic fits
def calculate_quadratic_variance(x,y,coeff):
    n = len(x)
    degree = len(coeff) - 1
    var = 0
    for i in range(0,n):
        if (degree == 1):
            y_line = coeff[0] * x[i] + coeff[1]
        else:
            y_line = coeff[0] * x[i]*x[i] + coeff[1] * x[i] + coeff[2]
        y_data = y[i]
        distance = y_data - y_line
        var = var + distance ** 2
    var = var / float(n)
    return var
dutch_men_quadratic_variance = calculate_quadratic_variance(year_array,dutch_men_array,dutch_men_coefficient_quadratic)
us_men_quadratic_variance = calculate_quadratic_variance(year_array,us_men_array,us_men_coefficient_quadratic)
print(f'The quadratic variance for Dutch men is {dutch_men_quadratic_variance}')
print(f'The quadratic variance for U.S. men is {us_men_quadratic_variance}')
# Use best fit to predict average heights in 1955 and 1995 for both Dutch and U.S.; compute percent error;
# round values to 2 decimal places
#
#initialize our prediction years
prediction_years = np.array([1955,1995])
def calculate_percent_error(predicted,actual):
    return abs(predicted-actual)/actual * 100
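# Quick sanity check of the percent-error helper (illustrative values only, not data
# from the assignment): a prediction of 101.0 against an actual value of 100.0 is a 1% error.
assert np.isclose(calculate_percent_error(101.0, 100.0), 1.0)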
dutch_men_prediction = one_d_dutch_men(prediction_years)
dutch_men_1955 = round(dutch_men_prediction[0], 2)
dutch_men_1995 = round(dutch_men_prediction[1], 2)
dutch_percent_error_1955 = round(calculate_percent_error(dutch_men_1955, dutch_man_1955), 2)
dutch_percent_error_1995 = round(calculate_percent_error(dutch_men_1995, dutch_man_1995), 2)
print(f'The prediction for a Dutch man in 1955 is {dutch_men_1955} with a percent error of {dutch_percent_error_1955}%')
print(f'The prediction for a Dutch man in 1995 is {dutch_men_1995} with a percent error of {dutch_percent_error_1995}%')
us_men_prediction = one_d_us_men(prediction_years)
us_men_1955 = round(us_men_prediction[0], 2)
us_men_1995 = round(us_men_prediction[1], 2)
us_percent_error_1955 = round(calculate_percent_error(us_men_1955, us_man_1955), 2)
us_percent_error_1995 = round(calculate_percent_error(us_men_1995, us_man_1995), 2)
print('**********************************************************************')
print(f'The prediction for a U.S. man in 1955 is {us_men_1955} with a percent error of {us_percent_error_1955}%')
print(f'The prediction for a U.S. man in 1995 is {us_men_1995} with a percent error of {us_percent_error_1995}%')
# The height data are not well described by a straight line: the linear fit has a higher
# variance than the quadratic fit, and that larger residual spread hurts its predictions.
# The quadratic model is therefore the better choice here.
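# Optional check backing up the conclusion above (a minimal sketch, not part of the
# original notebook): compare the linear and quadratic variances for each group directly.
for group, lin_var, quad_var in [('Dutch men', dutch_men_linear_variance, dutch_men_quadratic_variance),
                                 ('U.S. men', us_men_linear_variance, us_men_quadratic_variance)]:
    better = 'quadratic' if quad_var < lin_var else 'linear'
    print(f'{group}: linear variance = {lin_var:.4f}, quadratic variance = {quad_var:.4f} -> {better} fit is better')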