Project 4 - Regression

# Import libraries
import matplotlib.pyplot as plt
import numpy as np

# Input exact values for comparison
# These are the reference ("exact") average heights in centimeters for three
# years; predictions from the fitted curves can be compared against them.
# Men's values, kept for reference only (this notebook compares the women):
#dutch_man_1925 = 174.83; us_man_1925 = 174.53
#dutch_man_1955 = 180.23; us_man_1955 = 177.22
#dutch_man_1995 = 182.54; us_man_1995 = 177.16
#***************************************************
# I will be comparing the women's heights
dutch_woman_1925 = 162.2
us_woman_1925 = 160.97
dutch_woman_1955 = 167.11
us_woman_1955 = 163.54
dutch_woman_1995 = 168.73
us_woman_1995 = 163.56

filename = 'human_heights.txt'

# Parse the file once and unpack the three columns that are needed, instead
# of re-reading it for every column. The first two lines are skipped.
# NOTE(review): presumably column 0 is the year, column 2 the Dutch women and
# column 4 the U.S. women -- confirm against the header of the data file.
year, d_woman, u_woman = np.loadtxt(
    filename, usecols=(0, 2, 4), skiprows=2, unpack=True
)

print(d_woman)
print(u_woman)

[155.82 158.31 160.99 163.28 165.2  166.45 167.84 168.85 168.94 168.8 ]
[158.74 159.34 160.32 161.69 162.91 163.36 163.76 164.02 163.84 163.65]

# Read in the data and plot; add axes labels, plot title and legend
# Note: to add legend use label option in plt.plot (e.g., label='Dutch Men') and then use
# the command plt.legend ()
# This cell only shows the raw data points; no fit is computed here. The
# regression against the year is done in the cells that follow.
plt.plot(year, u_woman, 'bo', label = 'American Women')
plt.plot(year, d_woman, 'ro', label = 'Dutch Women')
plt.xlabel('Year')
plt.ylabel('Height in Centimeters')
plt.title('Dutch vs. American Women')
plt.legend()

# Linear regression fit for both Dutch and U.S.; plot and print out the line
# np.polyfit returns the coefficients highest power first, so for degree 1
# index 0 is the slope and index 1 is the intercept.

# Dutch women: fit, print the line, and sample it for plotting.
coef_line1 = np.polyfit(year, d_woman, 1)
f1 = np.poly1d(coef_line1)
print('Dutch Women line: height =', coef_line1[0], '* year +', coef_line1[1])
x = np.linspace(1900, 1995, 10)
y = f1(x)
plt.plot(x, y, 'r')
#plt.show()

# American women: same steps.
coef_line2 = np.polyfit(year, u_woman, 1)
f2 = np.poly1d(coef_line2)
print('American Women line: height =', coef_line2[0], '* year +', coef_line2[1])
xx = np.linspace(1900, 1995, 10)
yy = f2(xx)
plt.plot(xx, yy, 'b')
#plt.show()

# Overlay the measured data points on the fitted lines.
plt.plot(year, u_woman, 'bo', label = 'American Women')
plt.plot(year, d_woman, 'ro', label = 'Dutch Women')
plt.xlabel('Year')
plt.ylabel('Height in Centimeters')
plt.title('Dutch vs. American Women')
plt.legend()

# Calculate the variance for each fit; use the function that we wrote in a previous notebook
# Input: the x and y arrays for the data points, coefficients of line found using LR
# Output: variance
def calculate_variance(x, y, coef):
    """Return the mean squared vertical distance between data and a fit.

    Args:
        x: x coordinates of the data points.
        y: y coordinates of the data points, same length as ``x``.
        coef: polynomial coefficients, highest power first (the order
            returned by ``np.polyfit``). Any degree is accepted.

    Returns:
        The sum of squared residuals ``y - p(x)`` divided by the number of
        points, as a float.

    Raises:
        ValueError: if ``x`` is empty or ``x`` and ``y`` differ in shape.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    if x.size == 0:
        raise ValueError('calculate_variance needs at least one data point')
    if x.shape != y.shape:
        raise ValueError('x and y must have the same shape')
    # np.polyval evaluates the polynomial for every x at once, which covers
    # the linear and the quadratic case (and any other degree).
    residuals = y - np.polyval(coef, x)
    return float(np.mean(residuals * residuals))


# The variance has to be measured against the observed data (year, heights).
# The x/y and xx/yy arrays from the fitting cell are samples of the fitted
# lines themselves, so using them would compare each line with itself.
var_line = calculate_variance(year, d_woman, coef_line1)
print('Variance for the Dutch Women line is', var_line)

var_line2 = calculate_variance(year, u_woman, coef_line2)
print('Variance for the American Women line is', var_line2)

Variance for the Dutch Women line is 3.0999318281535304
Variance for the American Women line is 0.8606209933016302

# Quadratic regression fit for Dutch and U.S.; plot and print out the parabolas
# np.polyfit returns the coefficients highest power first: [a, b, c] for
# a * year^2 + b * year + c.

# Dutch women: fit, print the parabola, and sample it for plotting.
coef_q1 = np.polyfit(year, d_woman, 2)
g1 = np.poly1d(coef_q1)
print('Dutch Women parabola: height =', coef_q1[0], '* year^2 +',
      coef_q1[1], '* year +', coef_q1[2])
x = np.linspace(1900, 2000, 10)
y = g1(x)
plt.plot(x, y, 'r')
#plt.show()

# American women: same steps.
coef_q2 = np.polyfit(year, u_woman, 2)
g2 = np.poly1d(coef_q2)
print('American Women parabola: height =', coef_q2[0], '* year^2 +',
      coef_q2[1], '* year +', coef_q2[2])
xx = np.linspace(1900, 2000, 10)
yy = g2(xx)
plt.plot(xx, yy, 'b')
#plt.show()

# Overlay the measured data points and decorate the figure the same way as
# the linear-fit plot; plt.legend() is required for the labels to be drawn.
plt.plot(year, u_woman, 'bo', label = 'American Women')
plt.plot(year, d_woman, 'ro', label = 'Dutch Women')
plt.xlabel('Year')
plt.ylabel('Height in Centimeters')
plt.title('Dutch vs. American Women')
plt.legend()

# Calculate variance for the quadratic fits
# calculate_variance() is the function defined in the variance cell for the
# linear fits; it already handles degree-2 coefficients, so it is reused here
# instead of being defined again.
#
# The variance has to be measured against the observed data (year, heights).
# The x/y and xx/yy arrays from the quadratic-fit cell are samples of the
# parabolas themselves, so using them compares each curve with itself and
# gives a variance of essentially zero no matter how good the fit is.
var_q = calculate_variance(year, d_woman, coef_q1)
print('Variance for the Dutch Women quadratic is', var_q)

var_q2 = calculate_variance(year, u_woman, coef_q2)
print('Variance for the American Women quadratic is', var_q2)

Variance for the Dutch Women quadratic is 3.308722450212111e-25
Variance for the American Women quadratic is 1.0339757656912846e-25

# Use best fit to predict average heights in 1955 and 1995 for both Dutch and U.S.; compute percent error;
# round values to 2 decimal places
# NOTE(review): the heading asks for 1955 and 1995, while this cell has always
# evaluated 1925 and 1995; the evaluated years are kept. To add 1955, append
# rows with the reference values 167.11 (Dutch) and 163.54 (American).
def percent_error(predicted, exact):
    """Return the percent error of ``predicted`` relative to ``exact``.

    Percent error is |predicted - exact| / |exact| * 100.
    """
    return abs(predicted - exact) / abs(exact) * 100.0


# (year, group, exact average height in cm, linear fit, quadratic fit)
comparisons = [
    (1925, 'Dutch', 162.2, f1, g1),
    (1995, 'Dutch', 168.73, f1, g1),
    (1925, 'American', 160.97, f2, g2),
    (1995, 'American', 163.56, f2, g2),
]

for target_year, group, exact, linear_fit, quadratic_fit in comparisons:
    linear_value = float(linear_fit(target_year))
    quadratic_value = float(quadratic_fit(target_year))
    linear_error = percent_error(linear_value, exact)
    quadratic_error = percent_error(quadratic_value, exact)

    print('{:.2f} (linear, percent error {:.2f}%)'.format(linear_value, linear_error))
    print('{:.2f} (quadratic, percent error {:.2f}%)'.format(quadratic_value, quadratic_error))

    # Pick the fit with the smaller percent error rather than hard-coding it;
    # on a tie the simpler linear fit is reported.
    if linear_error <= quadratic_error:
        best_name, best_value = 'linear', linear_value
    else:
        best_name, best_value = 'quadratic', quadratic_value

    print('In {},the {} women averaged at {}, therefore the most accurate fit is the {} with the value {:.2f}'.format(
        target_year, group, exact, best_name, best_value))
    print('***************************************************')

161.4727272727274
162.23869318181823
In 1925,the Dutch women averaged at 162.2, therefore the most accurate fit is the quadratic with the value 162.23869318181823
***************************************************
171.8861818181819
168.8673749999989
In 1955,the Dutch women averaged at 168.73, therefore the most accurate fit is the quadratic with the value 168.8673749999989
***************************************************
160.94057575757571
161.34416477272816
In 1925,the American women averaged at 160.97, therefore the most accurate fit is the linear with the value 160.94057575757571
***************************************************
165.2190606060606
163.62844507575755
In 1955,the American women averaged at 163.56, therefore the most accurate fit is the quadratic with the value 163.62844507575755
***************************************************