Project 4 - Regression

# Import libraries import numpy as np import matplotlib.pyplot as plt

# Input exact values for comparison # dutch_man_1925 = 174.83; us_man_1925 = 174.53 dutch_man_1955 = 180.23; us_man_1955 = 177.22 dutch_man_1995 = 182.54; us_man_1995 = 177.16 # dutch_woman_1925 = 162.2; us_woman_1925 = 160.97 dutch_woman_1955 = 167.11; us_woman_1955 = 163.54 dutch_woman_1995 = 168.73; us_woman_1995 = 163.56

# Read in the data and plot; add axes labels, plot title and legend # Note: to add legend use label option in plt.plot (e.g., label='Dutch Men') and then use # the command plt.legend () # my_data = np.loadtxt('human_heights.txt', skiprows = 2) years = np.loadtxt('human_heights.txt', skiprows = 2, usecols= 0) dutch_men = np.loadtxt('human_heights.txt', skiprows = 2, usecols= 1) us_men = np.loadtxt('human_heights.txt', skiprows = 2, usecols= 3) print (years) print (dutch_men) print (us_men) plt.plot (years, dutch_men, 'bo', label = 'Dutch Men') plt.plot (years, us_men, 'ro', label = 'US Men') plt.title ("Dutch Men vs US Men Average Height") plt.xlabel ("Time (years)") plt.ylabel ("Average Height (cm)") plt.legend ()

# Linear regression fit for both Dutch and U.S.; plot and print out the line dutch_line_coeff = np.polyfit (years ,dutch_men,1) print (f"equation of the line is for Dutch Men is = {dutch_line_coeff[0]}x + {dutch_line_coeff[1]}" ) f = np.poly1d(dutch_line_coeff) us_line_coeff = np.polyfit (years ,us_men,1) print (f"equation of the line is for US Men is = {us_line_coeff[0]}x + {us_line_coeff[1]}" ) f_1 = np.poly1d(us_line_coeff) my_data = np.loadtxt('human_heights.txt', skiprows = 2) years = np.loadtxt('human_heights.txt', skiprows = 2, usecols= 0) dutch_men = np.loadtxt('human_heights.txt', skiprows = 2, usecols= 1) us_men = np.loadtxt('human_heights.txt', skiprows = 2, usecols= 3) plt.plot (years, dutch_men, 'bo', label = 'Dutch Men') plt.plot (years, us_men, 'ro', label = 'US Men') plt.title ("Dutch Men vs US Men Average Height") plt.xlabel ("Time (years)") plt.ylabel ("Average Height (cm)") plt.plot(years, f(years),'b') plt.plot(years, f_1(years),'r') plt.legend ()

# Calculate the variance for each fit; use the function that we wrote in a previous notebook def calculate_variance ( x, y,coeff): n=len(x) degree = len(coeff) -1 var = 0.0 for i in range(0,n) : if (degree == 1 ) : y_line = coeff[0] * x[i] + coeff[1] # calculate value of y on line for given x[i] else : y_line = coeff[0] * x[i]*x[i] + coeff[1] *x[i] + coeff[2] y_data = y[i] # y height of data point at x[i] distance = y_data-y_line var = var + distance * distance var = ( var )/ float(n) return (var) # Input: the x and y arrays for the data points, coefficients of line found using LR var_dutch = calculate_variance_linear (years, dutch_men, dutch_line_coeff) var_us = calculate_variance_linear (years, us_men, us_line_coeff) # Output: variance print (f"Dutch Variance = {var_dutch}") print (f"US Variance {var_us}")

# Quadratic regression fit for Dutch and U.S.; plot and print out the parabolas dutch_quad_coeff = np.polyfit (years ,dutch_men,2) print (f"equation of the line is for Dutch Men is = {dutch_quad_coeff[0]}x + {dutch_quad_coeff[1]}" ) q_f = np.poly1d(dutch_quad_coeff) us_quad_coeff = np.polyfit (years ,us_men,2) print (f"equation of the line is for US Men is = {us_quad_coeff[0]}x + {us_quad_coeff[1]}" ) q_f_1 = np.poly1d(us_quad_coeff) my_data = np.loadtxt('human_heights.txt', skiprows = 2) years = np.loadtxt('human_heights.txt', skiprows = 2, usecols= 0) dutch_men = np.loadtxt('human_heights.txt', skiprows = 2, usecols= 1) us_men = np.loadtxt('human_heights.txt', skiprows = 2, usecols= 3) plt.plot (years, dutch_men, 'bo', label = 'Dutch Men') plt.plot (years, us_men, 'ro', label = 'US Men') plt.title ("Dutch Men vs US Men Average Height") plt.xlabel ("Time (years)") plt.ylabel ("Average Height (cm)") plt.plot(years, q_f(years),'b') plt.plot(years, q_f_1(years),'r') plt.legend ()

# Calculate variance for the quadratic fits def calculate_variance ( x, y,coeff): n=len(x) degree = len(coeff) -1 var = 0.0 for i in range(0,n) : if (degree == 1 ) : y_line = coeff[0] * x[i] + coeff[1] # calculate value of y on line for given x[i] else : y_line = coeff[0] * x[i]*x[i] + coeff[1] *x[i] + coeff[2] y_data = y[i] # y height of data point at x[i] distance = y_data-y_line var = var + distance * distance var = ( var )/ float(n) return (var) quad_var_dutch = calculate_variance (years, dutch_men, dutch_quad_coeff) quad_var_us = calculate_variance (years, us_men, us_quad_coeff) print (f'Dutch Men Variance = {quad_var_dutch}') print (f'US Men Variance = {quad_var_us}')

# Use best fit to predict average heights in 1955 and 1995 for both Dutch and U.S.; compute percent error; # round values to 2 decimal places # dutch_man_1955 = 180.23; us_man_1955 = 177.22 dutch_man_1995 = 182.54; us_man_1995 = 177.16 def perc_error (actual, predicated): error = ((actual - predicated) / actual) * 100 return (round (error,3)) x = 1955 y = 1995 x_pred_lin_dutch = f(x) x_pred_quad_dutch = q_f(x) x_pred_lin_us = f_1(x) x_pred_quad_us = q_f_1(x) y_pred_lin_dutch = f(y) y_pred_quad_dutch = q_f(y) y_pred_lin_us = f_1(y) y_pred_quad_us = q_f_1(y) print ("YEAR PERSON ACTUAL AVRG LIN_PRED %ERROR for LIN QUAD_PRED %ERROR for Quad") print (f"1955 DUTCH 180.23 {x_pred_lin_dutch} {perc_error(180.23,x_pred_lin_dutch)} {x_pred_quad_dutch} {perc_error(180.23,x_pred_quad_dutch)}") print (f"1955 US 177.22 {x_pred_lin_us} {perc_error(177.22,x_pred_lin_us)} {x_pred_quad_us} {perc_error(177.22,x_pred_quad_us)}") print (f"1995 DUTCH 182.54 {y_pred_lin_dutch} {(perc_error(182.54 ,y_pred_lin_dutch))*-1} {y_pred_quad_dutch} {(perc_error(182.54 ,y_pred_quad_dutch))*-1}") print (f"1995 US 177.16 {y_pred_lin_us} {(perc_error(177.16 ,y_pred_lin_us))*-1} {y_pred_quad_us} {perc_error(177.16 ,y_pred_quad_us)}")

The best one is the porabola because it has a lower percent error than the prediction by the linear regression model.