# Python Project 4 - Regression

# Import libraries
import matplotlib.pyplot as plt
import numpy as np

# Input exact values for comparison
#
dutch_man_1925, us_man_1925 = 174.83, 174.53
dutch_man_1955, us_man_1955 = 180.23, 177.22
dutch_man_1995, us_man_1995 = 182.54, 177.16
#
dutch_woman_1925, us_woman_1925 = 162.2, 160.97
dutch_woman_1955, us_woman_1955 = 167.11, 163.54
dutch_woman_1995, us_woman_1995 = 168.73, 163.56

# Read three columns from the data file, skipping its first two lines:
# column 0 -> year, column 2 -> d_women, column 4 -> u_women
# (presumably Dutch / U.S. women's heights -- verify against the file layout).
filename = "human_heights.txt"
year, d_women, u_women = (
    np.loadtxt(filename, usecols=column, skiprows=2) for column in (0, 2, 4)
)

# Read in the data and plot; add axes labels, plot title and legend
# Note: to add legend use label option in plt.plot (e.g., label='Dutch Men') and then use
#   the command plt.legend ()

# NOTE(review): this fit regresses d_women on u_women (not on year) and `f` is
# not used by the plot in this cell -- confirm whether it is still needed.
coef_line1 = np.polyfit(u_women, d_women, deg=1)
f = np.poly1d(coef_line1)

# Axis annotations for the scatter plot.
plt.xlabel("Year")
plt.ylabel("Height (cm)")
plt.title("Heights of US vs Dutch Women")

# Raw data points: blue stars for the U.S. series, red stars for the Dutch one.
plt.plot(year, u_women, "b*", label="American Women")
plt.plot(year, d_women, "r*", label="Dutch Women")
plt.legend()

# Linear regression fit for both Dutch and U.S.; plot and print out the line

# Dutch women: degree-1 least-squares fit of height against year, drawn as a
# red line sampled at 10 evenly spaced years between 1900 and 1995.
coef_line1 = np.polyfit(year, d_women, deg=1)
f1 = np.poly1d(coef_line1)
x = np.linspace(1900, 1995, num=10)
y = np.polyval(coef_line1, x)
plt.plot(x, y, "r")

# American women: same procedure, drawn in blue.
coef_line2 = np.polyfit(year, u_women, deg=1)
f2 = np.poly1d(coef_line2)
xx = np.linspace(1900, 1995, num=10)
yy = np.polyval(coef_line2, xx)
plt.plot(xx, yy, "b")

# Copy from previous question: the raw data points plus axis annotations.
plt.plot(year, u_women, "b*", label="American Women")
plt.plot(year, d_women, "r*", label="Dutch Women")
plt.xlabel("Year")
plt.ylabel("Height (cm)")
plt.title("Heights of US vs Dutch Women")
plt.legend()

# Calculate the variance for each fit; use the function that we wrote in a previous notebook
def calculate_variance(x, y, coef_line2):
    """Return the variance (mean squared residual) of a polynomial fit.

    Input:  x, y        -- the x and y arrays for the data points
            coef_line2  -- coefficients of the fitted polynomial, highest
                           power first (the order np.polyfit returns)
    Output: sum((y[i] - p(x[i]))**2) / len(x)

    Works for a polynomial of any degree; the earlier version treated every
    degree other than 1 as a quadratic.  The third parameter keeps the name
    used by the original (last) definition so keyword calls keep working; it
    only shadows the module-level coef_line2 inside this function.

    Raises ZeroDivisionError when x is empty.
    """
    n = len(x)
    x_values = np.asarray(x, dtype=float)
    y_values = np.asarray(y, dtype=float)
    residuals = y_values - np.polyval(coef_line2, x_values)
    # Plain float division so an empty input still raises ZeroDivisionError.
    return float(np.sum(residuals * residuals)) / float(n)


# A variance of heights measured in cm has units of cm^2, not cm.

# Dutch women
var = calculate_variance(year, d_women, coef_line1)
print("The variance for height in Dutch women is", var, "cm^2")

# American women
var = calculate_variance(year, u_women, coef_line2)
print("The variance for height in American women is", var, "cm^2")

# Quadratic regression fit for Dutch and U.S.; plot and print out the parabolas

# Dutch women: degree-2 least-squares fit of height against year, drawn as a
# red curve sampled at 10 evenly spaced years between 1900 and 2000.
coef_q1 = np.polyfit(year, d_women, 2)
g1 = np.poly1d(coef_q1)
x = np.linspace(1900, 2000, 10)
y = g1(x)
plt.plot(x, y, "r")
# Original data points (Dutch); labelled so plt.legend() has an entry to show.
plt.plot(year, d_women, "r*", label="Dutch Women")

# American women: same procedure, drawn in blue.
coef_q2 = np.polyfit(year, u_women, 2)
g2 = np.poly1d(coef_q2)
xx = np.linspace(1900, 2000, 10)
yy = g2(xx)
plt.plot(xx, yy, "b")
# Original data points (American); labelled for the legend.
plt.plot(year, u_women, "b*", label="American Women")

plt.xlabel("Year")
plt.ylabel("Height (cm)")  # closing parenthesis was missing from the label text
plt.legend()

# Calculate variance for the quadratic fits
def calculate_variance(x, y, coef_q2):
    """Return the variance (mean squared residual) of a polynomial fit.

    Input:  x, y     -- the x and y arrays for the data points
            coef_q2  -- coefficients of the fitted polynomial, highest power
                        first (the order np.polyfit returns)
    Output: sum((y[i] - p(x[i]))**2) / len(x)

    Works for a polynomial of any degree.  The third parameter keeps the name
    used by the original (last) definition so keyword calls keep working; it
    only shadows the module-level coef_q2 inside this function.

    Raises ZeroDivisionError when x is empty.
    """
    n = len(x)
    x_values = np.asarray(x, dtype=float)
    y_values = np.asarray(y, dtype=float)
    residuals = y_values - np.polyval(coef_q2, x_values)
    # Plain float division so an empty input still raises ZeroDivisionError.
    return float(np.sum(residuals * residuals)) / float(n)


# The variance must be measured against the observed data (year, heights).
# Passing the plotting samples (x, y) / (xx, yy) compares the fitted curve
# with itself and always yields roughly zero.
# A variance of heights measured in cm has units of cm^2, not cm.

# Dutch women
var = calculate_variance(year, d_women, coef_q1)
print("The variance for height in the Dutch women quadratic is", var, "cm^2")

# American women
var = calculate_variance(year, u_women, coef_q2)
print("The variance for height in the American women quadratic is", var, "cm^2")

# Use best fit to predict average heights in 1955 and 1995 for both Dutch and U.S.; compute percent error;
#   round values to 2 decimal places

_SEPARATOR = "***********************************************************************************************************"


def _report_prediction(group, target_year, linear_fit, quadratic_fit, actual):
    """Print both model predictions for one group/year and return the percent error.

    group          -- adjective used in the printed sentence ("Dutch", "American")
    target_year    -- year at which both fits are evaluated
    linear_fit     -- callable returning the linear prediction for a year
    quadratic_fit  -- callable returning the quadratic prediction for a year
    actual         -- known average height (cm) for that group and year

    The fit whose rounded prediction lies closer to `actual` is reported as the
    better one (the quadratic wins ties), and the returned percent error,
    [(predicted - actual) / actual] * 100 rounded to 2 decimals, is computed
    from that fit's rounded prediction.
    """
    linear_pred = round(linear_fit(target_year), 2)
    quadratic_pred = round(quadratic_fit(target_year), 2)
    print("Linear:", linear_pred)        # Linear version
    print("Quadratic:", quadratic_pred)  # Quadratic version

    if abs(linear_pred - actual) < abs(quadratic_pred - actual):
        better_name, better_pred = "linear", linear_pred
    else:
        better_name, better_pred = "quadratic", quadratic_pred
    print(
        f"In {target_year} {group} women's average height was {actual} cm, "
        f"so the better fit for the prediction is the {better_name}"
    )

    perc_err = round(float(abs((better_pred - actual) / actual) * 100), 2)
    print("The percent error is", perc_err)
    return perc_err


# The percent errors come from the fitted models and the exact values entered
# at the top of the file, not from numbers copied by hand out of earlier output.

# Dutch women 1955
perc_err1 = _report_prediction("Dutch", 1955, f1, g1, dutch_woman_1955)
print(_SEPARATOR)

# Dutch women 1995
perc_err2 = _report_prediction("Dutch", 1995, f1, g1, dutch_woman_1995)
print(_SEPARATOR)

# American women 1955
perc_err3 = _report_prediction("American", 1955, f2, g2, us_woman_1955)
print(_SEPARATOR)

# American women 1995 (the message used to quote the Dutch value, 168.73)
perc_err4 = _report_prediction("American", 1995, f2, g2, us_woman_1995)