Project 4 - Regression

# Import libraries import numpy as np import matplotlib.pyplot as plt

# Input exact values for comparison # dutch_man_1925 = 174.83; us_man_1925 = 174.53 dutch_man_1955 = 180.23; us_man_1955 = 177.22 dutch_man_1995 = 182.54; us_man_1995 = 177.16

# Read in the data and plot; add axes labels, plot title and legend # Note: to add legend use label option in plt.plot (e.g., label='Dutch Men') and then use # the command plt.legend () # years = np.loadtxt('human_heights.txt', skiprows=2, usecols=0) dutch_men = np.loadtxt('human_heights.txt', skiprows=2, usecols=2) us_men = np.loadtxt('human_heights.txt', skiprows=2, usecols=4) plt.plot(years, dutch_men, 'ro', label="Dutch Men") plt.plot(years, us_men, 'bo', label="US Men") plt.xlabel = "Years" plt.ylabel = "Height (cm)" plt.title("US Men height vs Dutch Men height") plt.legend()

# Linear regression fit for both Dutch and U.S.; plot and print out the line y_array = np.array(years) dutch_men_array = np.array(dutch_men) us_men_array = np.array(us_men) dutch_men_coefficient = np.polyfit(y_array, dutch_men_array, 1) us_men_coefficient = np.polyfit(y_array, us_men_array, 1) print(f"Dutch Men heights: y = {round(dutch_men_coefficient[0], 2)} * x + {round(dutch_men_coefficient[1], 2)}") print(f"US Men heights: y = {round(us_men_coefficient[0], 2)} * x + {round(us_men_coefficient[1], 2)}") def eval_line (coeff, x_eval) : return (coeff[0] * x_eval + coeff[1]) dutch_men_line = eval_line(dutch_men_coefficient, y_array) us_men_line = eval_line(us_men_coefficient, y_array) plt.plot(y_array, dutch_men_line, 'r', label="Dutch Men") plt.plot(y_array, us_men_line, 'b', label="US Men") plt.xlabel = "Years" plt.ylabel = "Height (cm)" plt.title("US Men height vs Dutch Men height") plt.legend()

# Calculate the variance for each fit; use the function that we wrote in a previous notebook # # Input: the x and y arrays for the data points, coefficients of line found using LR # # Output: variance # def linear_variance(x, y, coeff): var = 0. n = len(x) for i in range(0,n) : y_line = coeff[0] * x[i] + coeff[1] y_data = y[i] distance = y_data-y_line var = var + distance **2 var = (var)/ float(n) return (var) dutch_men_linear_variance = linear_variance(years, dutch_men, dutch_men_coefficient) us_men_linear_variance = linear_variance(years, us_men, us_men_coefficient) print(f"Linear variance for US: {us_men_linear_variance}") print(f"Linear variance for Dutch: {dutch_men_linear_variance}")

# Quadratic regression fit for Dutch and U.S.; plot and print out the parabolas dutch_quad = np.polyfit(y_array, dutch_men_array, 2) us_quad = np.polyfit(y_array, us_men_array, 2) print(f"US men heights: y = {round(us_quad[0], 2)} * x^2 + {round(us_quad[1], 2)} * x + {round(us_quad[2], 2)}") print(f"Dutch men heights: y = {round(dutch_quad[0], 2)} * x^2 + {round(dutch_quad[1], 2)} * x + {round(dutch_quad[2], 2)}") dutch_f = np.poly1d(dutch_quad) us_f = np.poly1d(us_quad) dutch_parabola = dutch_f(y_array) us_parabola = us_f(y_array) plt.plot(y_array, dutch_parabola, 'r', label="Dutch men Quadratic") plt.plot(y_array, us_parabola, 'b', label="US men Quadratic") plt.xlabel = "Years" plt.ylabel = "Height (cm)" plt.title("US men height vs Dutch men height") plt.legend()

# Calculate variance for the quadratic fits def quadratic_variance (x, y, coeff): n=len(x) degree = len(coeff) -1 var = 0. for i in range(0,n): if (degree == 1): y_line = coeff[0] * x[i] + coeff[1] else : y_line = coeff[0] * x[i]*x[i] + coeff[1] *x[i] + coeff[2] y_data = y[i] distance = y_data-y_line var = var + distance ** 2 var = (var)/ float(n) return (var) dutch_quadratic_variance = quadratic_variance(years, dutch_men, dutch_quad) us_quadratic_variance = quadratic_variance(years, us_men, us_quad) print(f"Quadratic variance for US: {us_quadratic_variance}") print(f"Quadratic variance for Dutch: {dutch_quadratic_variance}")

# Use best fit to predict average heights in 1955 and 1995 for both Dutch and U.S.; compute percent error; # round values to 2 decimal places # unknownyears = np.array([1955, 1995]) def percent_error(predicted, actual) : return abs((predicted - actual))/actual * 100 us_estimate = us_f(unknownyears); us_1955 = round(us_estimate[0], 2) us_1995 = round(us_estimate[1], 2) predicted_us_1955 = round(percent_error(us_1955, us_1955), 2) predicted_us_1995 = round(percent_error(us_1995, us_1995), 2) dutch_estimate = dutch_f(unknownyears); dutch_1955 = round(dutch_estimate[0], 2) dutch_1995 = round(dutch_estimate[1], 2) predicted_dutch_1955 = round(percent_error(dutch_1955, dutch_1955), 2) predicted_dutch_1995 = round(percent_error(dutch_1995, dutch_1995), 2) print("1955:") print(f"Predicted average height for US men: {us_1955}. percent error: {predicted_us_1955}%") print(f"Predicted average height for Dutch men: {dutch_1955}. percent error: {predicted_dutch_1955}%") print("1995:") print(f"Predicted average height for US men: {us_1995}. percent error: {predicted_us_1995}%") print(f"Predicted average height for Dutch men: {dutch_1995}. percent error: {predicted_dutch_1995}%")

Since the data is non linear quadratic regression is the best model to use.