# Import libraries
import numpy as np
import matplotlib.pyplot as plt
# Input exact values for comparison
#
# Reference values for men, one year per line (Dutch value first, U.S. second).
# NOTE(review): presumably average heights in cm like the women's values below - confirm.
dutch_man_1925, us_man_1925 = 174.83, 174.53
dutch_man_1955, us_man_1955 = 180.23, 177.22
dutch_man_1995, us_man_1995 = 182.54, 177.16

# Reference average heights for women in cm, same layout as above.
dutch_woman_1925, us_woman_1925 = 162.2, 160.97
dutch_woman_1955, us_woman_1955 = 167.11, 163.54
dutch_woman_1995, us_woman_1995 = 168.73, 163.56
# Load the columns used below in a single pass over the file: column 0 is the
# year, column 2 the Dutch women's heights and column 4 the U.S. women's
# heights. The first two lines of the file are skipped (presumably headers).
filename = "human_heights.txt"
year, d_women, u_women = np.loadtxt(
    filename, usecols=(0, 2, 4), skiprows=2, unpack=True
)
# Read in the data and plot; add axes labels, plot title and legend
# Note: to add legend use label option in plt.plot (e.g., label='Dutch Men') and then use
# the command plt.legend ()
# Scatter plot of the measured heights against year, one marker colour per
# country; the labels feed the legend.
plt.plot(year, u_women, "b*", label="American Women")
plt.plot(year, d_women, "r*", label="Dutch Women")
plt.xlabel("Year")
plt.ylabel("Height (cm)")
plt.title("Heights of US vs Dutch Women")
plt.legend()
# Linear regression fit for both Dutch and U.S.; plot and print out the line
#Dutch women
# Least-squares straight-line fits of height against year, one per country.
# np.polyfit returns the coefficients highest power first: [slope, intercept].
# Dutch women
coef_line1 = np.polyfit(year, d_women, 1)
f1 = np.poly1d(coef_line1)
x = np.linspace(1900, 1995, 10)
y = f1(x)
plt.plot(x, y, "r")
print("Dutch women linear fit (slope, intercept):", coef_line1[0], coef_line1[1])
# American women
coef_line2 = np.polyfit(year, u_women, 1)
f2 = np.poly1d(coef_line2)
xx = np.linspace(1900, 1995, 10)
yy = f2(xx)
plt.plot(xx, yy, "b")
print("American women linear fit (slope, intercept):", coef_line2[0], coef_line2[1])
# Overlay the measured data points on the fitted lines.
plt.plot(year, u_women, "b*", label="American Women")
plt.plot(year, d_women, "r*", label="Dutch Women")
plt.xlabel("Year")
plt.ylabel("Height (cm)")
plt.title("Heights of US vs Dutch Women")
plt.legend()
# Calculate the variance for each fit; use the function that we wrote in a previous notebook
# Input: the x and y arrays for the data points, coefficients of line found using LR
# Output: variance
# Dutch women
def calculate_variance(x, y, coef_line1):
    """Return the mean squared residual of data points about a fitted polynomial.

    Args:
        x: Sequence of x coordinates of the data points.
        y: Sequence of y coordinates, one per entry of ``x``.
        coef_line1: Polynomial coefficients ordered from the highest power
            down to the constant term (the order ``np.polyfit`` returns).
            Any degree is accepted.

    Returns:
        The sum of squared differences between ``y`` and the polynomial
        evaluated at ``x``, divided by the number of points.

    Raises:
        ValueError: If ``x`` and ``y`` differ in length or are empty.
    """
    # NOTE(review): calculate_variance is defined again further down in this
    # file; consider keeping a single definition.
    x_values = np.asarray(x, dtype=float)
    y_values = np.asarray(y, dtype=float)
    if x_values.shape != y_values.shape:
        raise ValueError("x and y must contain the same number of points")
    if x_values.size == 0:
        raise ValueError("at least one data point is required")
    # np.polyval evaluates a polynomial of any degree, so constant, cubic and
    # higher-order fits are handled the same way as lines and parabolas.
    residuals = y_values - np.polyval(coef_line1, x_values)
    return float(np.mean(residuals**2))
# Mean squared residual of the Dutch women's measured heights about their
# linear fit. It averages squared heights, so it is reported in cm^2.
var = calculate_variance(year, d_women, coef_line1)
print("The variance for height in Dutch women is", var, "cm^2")
# American women
def calculate_variance(x, y, coef_line2):
    """Return the mean squared residual of data points about a fitted polynomial.

    Args:
        x: Sequence of x coordinates of the data points.
        y: Sequence of y coordinates, one per entry of ``x``.
        coef_line2: Polynomial coefficients ordered from the highest power
            down to the constant term (the order ``np.polyfit`` returns).
            Any degree is accepted.

    Returns:
        The sum of squared differences between ``y`` and the polynomial
        evaluated at ``x``, divided by the number of points.

    Raises:
        ValueError: If ``x`` and ``y`` differ in length or are empty.
    """
    # NOTE(review): calculate_variance is also defined earlier in this file;
    # consider keeping a single definition.
    x_values = np.asarray(x, dtype=float)
    y_values = np.asarray(y, dtype=float)
    if x_values.shape != y_values.shape:
        raise ValueError("x and y must contain the same number of points")
    if x_values.size == 0:
        raise ValueError("at least one data point is required")
    # np.polyval evaluates a polynomial of any degree, so constant, cubic and
    # higher-order fits are handled the same way as lines and parabolas.
    residuals = y_values - np.polyval(coef_line2, x_values)
    return float(np.mean(residuals**2))
# Mean squared residual of the American women's measured heights about their
# linear fit. It averages squared heights, so it is reported in cm^2.
var = calculate_variance(year, u_women, coef_line2)
print("The variance for height in American women is", var, "cm^2")
# Quadratic regression fit for Dutch and U.S.; plot and print out the parabolas
#Dutch women
# Least-squares parabolas of height against year, one per country.
# np.polyfit returns the coefficients highest power first: [a, b, c] for
# a*year**2 + b*year + c.
coef_q1 = np.polyfit(year, d_women, 2)
g1 = np.poly1d(coef_q1)
x = np.linspace(1900, 2000, 10)
y = g1(x)
plt.plot(x, y, "r")
print("Dutch women quadratic fit (a, b, c):", coef_q1[0], coef_q1[1], coef_q1[2])
# Original data points (Dutch); labelled so the legend has an entry for them.
plt.plot(year, d_women, "r*", label="Dutch Women")
# American women
coef_q2 = np.polyfit(year, u_women, 2)
g2 = np.poly1d(coef_q2)
xx = np.linspace(1900, 2000, 10)
yy = g2(xx)
plt.plot(xx, yy, "b")
print("American women quadratic fit (a, b, c):", coef_q2[0], coef_q2[1], coef_q2[2])
# Original data points (American); labelled so the legend has an entry for them.
plt.plot(year, u_women, "b*", label="American Women")
plt.xlabel("Year")
plt.ylabel("Height (cm)")
plt.legend()
# Calculate variance for the quadratic fits
# Basically copy from previous variance question but input new variables
# Dutch women
def calculate_variance(x, y, coef_q1):
    """Return the mean squared residual of data points about a fitted polynomial.

    Args:
        x: Sequence of x coordinates of the data points.
        y: Sequence of y coordinates, one per entry of ``x``.
        coef_q1: Polynomial coefficients ordered from the highest power
            down to the constant term (the order ``np.polyfit`` returns).
            Any degree is accepted.

    Returns:
        The sum of squared differences between ``y`` and the polynomial
        evaluated at ``x``, divided by the number of points.

    Raises:
        ValueError: If ``x`` and ``y`` differ in length or are empty.
    """
    # NOTE(review): calculate_variance is also defined earlier in this file;
    # consider keeping a single definition.
    x_values = np.asarray(x, dtype=float)
    y_values = np.asarray(y, dtype=float)
    if x_values.shape != y_values.shape:
        raise ValueError("x and y must contain the same number of points")
    if x_values.size == 0:
        raise ValueError("at least one data point is required")
    # np.polyval evaluates a polynomial of any degree, so constant, cubic and
    # higher-order fits are handled the same way as lines and parabolas.
    residuals = y_values - np.polyval(coef_q1, x_values)
    return float(np.mean(residuals**2))
# Measure the scatter of the recorded Dutch heights about the parabola. The
# measured data (year, d_women) must be passed here: x and y hold points
# sampled from the fitted curve itself, which would always give about zero.
# The result averages squared heights, so it is reported in cm^2.
var = calculate_variance(year, d_women, coef_q1)
print("The variance for height in the Dutch women quadratic is", var, "cm^2")
# American women
def calculate_variance(x, y, coef_q2):
    """Return the mean squared residual of data points about a fitted polynomial.

    Args:
        x: Sequence of x coordinates of the data points.
        y: Sequence of y coordinates, one per entry of ``x``.
        coef_q2: Polynomial coefficients ordered from the highest power
            down to the constant term (the order ``np.polyfit`` returns).
            Any degree is accepted.

    Returns:
        The sum of squared differences between ``y`` and the polynomial
        evaluated at ``x``, divided by the number of points.

    Raises:
        ValueError: If ``x`` and ``y`` differ in length or are empty.
    """
    # NOTE(review): calculate_variance is also defined earlier in this file;
    # consider keeping a single definition.
    x_values = np.asarray(x, dtype=float)
    y_values = np.asarray(y, dtype=float)
    if x_values.shape != y_values.shape:
        raise ValueError("x and y must contain the same number of points")
    if x_values.size == 0:
        raise ValueError("at least one data point is required")
    # np.polyval evaluates a polynomial of any degree, so constant, cubic and
    # higher-order fits are handled the same way as lines and parabolas.
    residuals = y_values - np.polyval(coef_q2, x_values)
    return float(np.mean(residuals**2))
# Measure the scatter of the recorded American heights about the parabola. The
# measured data (year, u_women) must be passed here: xx and yy hold points
# sampled from the fitted curve itself, which would always give about zero.
# The result averages squared heights, so it is reported in cm^2.
var = calculate_variance(year, u_women, coef_q2)
print("The variance for height in the American women quadratic is", var, "cm^2")
# Use best fit to predict average heights in 1955 and 1995 for both Dutch and U.S.; compute percent error;
# round values to 2 decimal places
# Each case: (group name, year to predict, recorded average height in cm,
# linear fit, quadratic fit). The recorded heights come from the reference
# constants defined at the top of the file rather than being retyped here.
prediction_cases = [
    ("Dutch", 1955, dutch_woman_1955, f1, g1),
    ("Dutch", 1995, dutch_woman_1995, f1, g1),
    ("American", 1955, us_woman_1955, f2, g2),
    ("American", 1995, us_woman_1995, f2, g2),
]
separator = "***********************************************************************************************************"
percent_errors = []
for case_number, case in enumerate(prediction_cases):
    group, target_year, actual, linear_fit, quadratic_fit = case
    # A separator line goes between consecutive cases, not after the last one.
    if case_number > 0:
        print(separator)
    linear_pred = round(linear_fit(target_year), 2)
    quadratic_pred = round(quadratic_fit(target_year), 2)
    print("Linear:", linear_pred)  # Linear version
    print("Quadratic:", quadratic_pred)  # Quadratic version
    # Report whichever prediction lands closer to the recorded value; a tie
    # goes to the quadratic.
    if abs(quadratic_pred - actual) <= abs(linear_pred - actual):
        better_name, better_pred = "quadratic", quadratic_pred
    else:
        better_name, better_pred = "linear", linear_pred
    print(
        "In {} {} women's average height was {} cm, "
        "so the better fit for the prediction is the {}".format(
            target_year, group, actual, better_name
        )
    )
    # [(Predicted - actual)/(actual)] * 100
    percent_error = round(abs((better_pred - actual) / actual) * 100, 2)
    print("The percent error is", percent_error)
    percent_errors.append(percent_error)
# Keep the individual result names available for any later use.
perc_err1, perc_err2, perc_err3, perc_err4 = percent_errors