# Import libraries
import numpy as np
import matplotlib.pyplot as plt
# Input exact values for comparison
#dutch_man_1925 = 174.83; us_man_1925 = 174.53
#dutch_man_1955 = 180.23; us_man_1955 = 177.22
#dutch_man_1995 = 182.54; us_man_1995 = 177.16
#***************************************************
# I will be comparing the women's heights
# I don't understand what is meant by "input exact values for comparison".
#dutch_woman_1925 = 162.2; us_woman_1925 = 160.97
#dutch_woman_1955 = 167.11; us_woman_1955 = 163.54
#dutch_woman_1995 = 168.73; us_woman_1995 = 163.56
# Load the year and both height series with a single pass over the file
# instead of parsing it three times.
# skiprows=2 skips the first two rows; column 0 is read as the year, column 2
# as the Dutch women's heights and column 4 as the U.S. women's heights
# (column meanings assumed from the variable names -- verify against the file).
filename = 'human_heights.txt'
year, d_woman, u_woman = np.loadtxt(
    filename, usecols=(0, 2, 4), skiprows=2, unpack=True
)
print(d_woman)
print(u_woman)
# Recorded notebook output:
# [155.82 158.31 160.99 163.28 165.2 166.45 167.84 168.85 168.94 168.8 ]
# [158.74 159.34 160.32 161.69 162.91 163.36 163.76 164.02 163.84 163.65]
# Read in the data and plot; add axes labels, plot title and legend
# Note: to add legend use label option in plt.plot (e.g., label='Dutch Men') and then use
# the command plt.legend ()
# Scatter plot of the measured heights against year for both countries.
# (A stray fit of Dutch heights against American heights used to sit here;
# its result was never used, so it has been removed.)
plt.plot(year, u_woman, 'bo', label='American Women')
plt.plot(year, d_woman, 'ro', label='Dutch Women')
plt.xlabel('Year')
plt.ylabel('Height in Centimeters')
plt.title('Dutch vs. American Women')
plt.legend()
# Linear regression fit for both Dutch and U.S.; plot and print out the line
# Start a fresh figure so this plot does not pile onto an earlier one when the
# file is run as a plain script rather than cell by cell.
plt.figure()

# Dutch women: least-squares line through (year, d_woman).
coef_line1 = np.polyfit(year, d_woman, 1)
f1 = np.poly1d(coef_line1)
x = np.linspace(1900, 1995, 10)  # sample points used only to draw the line
y = f1(x)
plt.plot(x, y, 'r', label='Dutch Women linear fit')
print('Dutch Women line: y =', coef_line1[0], '* x +', coef_line1[1])

# American women: least-squares line through (year, u_woman).
coef_line2 = np.polyfit(year, u_woman, 1)
f2 = np.poly1d(coef_line2)
xx = np.linspace(1900, 1995, 10)  # sample points used only to draw the line
yy = f2(xx)
plt.plot(xx, yy, 'b', label='American Women linear fit')
print('American Women line: y =', coef_line2[0], '* x +', coef_line2[1])

# Overlay the measured data so the fits can be judged by eye.
plt.plot(year, u_woman, 'bo', label='American Women')
plt.plot(year, d_woman, 'ro', label='Dutch Women')
plt.xlabel('Year')
plt.ylabel('Height in Centimeters')
plt.title('Dutch vs. American Women')
plt.legend()
# Calculate the variance for each fit; use the function that we wrote in a previous notebook
# Input: the x and y arrays for the data points, coefficients of line found using LR
# Output: variance
def calculate_variance(x, y, coef_line1):
    """Return the mean squared vertical distance between data and a polynomial.

    Args:
        x: x coordinates of the data points.
        y: y coordinates of the data points; same length as ``x``.
        coef_line1: polynomial coefficients, highest power first, in the
            order ``np.polyfit`` returns them. Any degree is accepted.

    Returns:
        The sum of the squared residuals ``y[i] - p(x[i])`` divided by the
        number of points, as a float.

    Raises:
        ValueError: if ``x`` is empty or ``x`` and ``y`` differ in length.
    """
    x_values = np.asarray(x, dtype=float)
    y_values = np.asarray(y, dtype=float)
    if x_values.size == 0:
        raise ValueError('calculate_variance needs at least one data point')
    if x_values.shape != y_values.shape:
        raise ValueError('x and y must have the same length')
    # np.polyval evaluates a polynomial of any degree, so constant or cubic
    # fits are handled correctly instead of being treated as quadratics.
    residuals = y_values - np.polyval(coef_line1, x_values)
    return float(np.mean(residuals ** 2))
# Measure the Dutch linear fit against the measured data (year, d_woman).
# x and y are points sampled from the fitted line itself, so passing them
# would always give a variance of essentially zero.
var_line = calculate_variance(year, d_woman, coef_line1)
print('Variance for the Dutch Women line is', var_line)
def calculate_variance(xx, yy, coef_line2):
    """Return the mean squared vertical distance between data and a polynomial.

    Args:
        xx: x coordinates of the data points.
        yy: y coordinates of the data points; same length as ``xx``.
        coef_line2: polynomial coefficients, highest power first, in the
            order ``np.polyfit`` returns them. Any degree is accepted.

    Returns:
        The sum of the squared residuals ``yy[i] - p(xx[i])`` divided by the
        number of points, as a float.

    Raises:
        ValueError: if ``xx`` is empty or ``xx`` and ``yy`` differ in length.
    """
    x_values = np.asarray(xx, dtype=float)
    y_values = np.asarray(yy, dtype=float)
    if x_values.size == 0:
        raise ValueError('calculate_variance needs at least one data point')
    if x_values.shape != y_values.shape:
        raise ValueError('xx and yy must have the same length')
    # np.polyval evaluates a polynomial of any degree, so constant or cubic
    # fits are handled correctly instead of being treated as quadratics.
    residuals = y_values - np.polyval(coef_line2, x_values)
    return float(np.mean(residuals ** 2))
# Measure the American linear fit against the measured data (year, u_woman).
# xx and yy are points sampled from the fitted line itself, so passing them
# would always give a variance of essentially zero.
var_line2 = calculate_variance(year, u_woman, coef_line2)
print('Variance for the American Women line is', var_line2)
# Neither of these looked right at first, and I didn't know how to fix it.
# Never mind -- got it to work!
# Recorded notebook output:
# Variance for the Dutch Women line is 3.0999318281535304
# Variance for the American Women line is 0.8606209933016302
# Quadratic regression fit for Dutch and U.S.; plot and print out the parabolas
# Start a fresh figure so this plot does not pile onto an earlier one when the
# file is run as a plain script rather than cell by cell.
plt.figure()

# Dutch women: least-squares parabola through (year, d_woman).
coef_q1 = np.polyfit(year, d_woman, 2)
g1 = np.poly1d(coef_q1)
x = np.linspace(1900, 2000, 10)  # sample points used only to draw the curve
y = g1(x)
plt.plot(x, y, 'r', label='Dutch Women quadratic fit')
print('Dutch Women parabola: y =', coef_q1[0], '* x^2 +',
      coef_q1[1], '* x +', coef_q1[2])

# American women: least-squares parabola through (year, u_woman).
coef_q2 = np.polyfit(year, u_woman, 2)
g2 = np.poly1d(coef_q2)
xx = np.linspace(1900, 2000, 10)  # sample points used only to draw the curve
yy = g2(xx)
plt.plot(xx, yy, 'b', label='American Women quadratic fit')
print('American Women parabola: y =', coef_q2[0], '* x^2 +',
      coef_q2[1], '* x +', coef_q2[2])

# Overlay the measured data and finish the figure; without plt.legend() the
# label= arguments above never appear on the plot.
plt.plot(year, u_woman, 'bo', label='American Women')
plt.plot(year, d_woman, 'ro', label='Dutch Women')
plt.xlabel('Year')
plt.ylabel('Height in Centimeters')
plt.title('Dutch vs. American Women')
plt.legend()
# Calculate variance for the quadratic fits
def calculate_variance(x, y, coef_q1):
    """Return the mean squared vertical distance between data and a polynomial.

    Args:
        x: x coordinates of the data points.
        y: y coordinates of the data points; same length as ``x``.
        coef_q1: polynomial coefficients, highest power first, in the order
            ``np.polyfit`` returns them. Any degree is accepted.

    Returns:
        The sum of the squared residuals ``y[i] - p(x[i])`` divided by the
        number of points, as a float.

    Raises:
        ValueError: if ``x`` is empty or ``x`` and ``y`` differ in length.
    """
    x_values = np.asarray(x, dtype=float)
    y_values = np.asarray(y, dtype=float)
    if x_values.size == 0:
        raise ValueError('calculate_variance needs at least one data point')
    if x_values.shape != y_values.shape:
        raise ValueError('x and y must have the same length')
    # np.polyval evaluates a polynomial of any degree, so constant or cubic
    # fits are handled correctly instead of being treated as quadratics.
    residuals = y_values - np.polyval(coef_q1, x_values)
    return float(np.mean(residuals ** 2))
# Measure the Dutch quadratic fit against the measured data (year, d_woman).
# x and y are points sampled from the fitted parabola itself, so passing them
# only yields floating-point round-off (about 1e-25), not a goodness of fit.
var_q = calculate_variance(year, d_woman, coef_q1)
print('Variance for the Dutch Women quadratic is', var_q)
def calculate_variance(xx, yy, coef_q2):
    """Return the mean squared vertical distance between data and a polynomial.

    Args:
        xx: x coordinates of the data points.
        yy: y coordinates of the data points; same length as ``xx``.
        coef_q2: polynomial coefficients, highest power first, in the order
            ``np.polyfit`` returns them. Any degree is accepted.

    Returns:
        The sum of the squared residuals ``yy[i] - p(xx[i])`` divided by the
        number of points, as a float.

    Raises:
        ValueError: if ``xx`` is empty or ``xx`` and ``yy`` differ in length.
    """
    x_values = np.asarray(xx, dtype=float)
    y_values = np.asarray(yy, dtype=float)
    if x_values.size == 0:
        raise ValueError('calculate_variance needs at least one data point')
    if x_values.shape != y_values.shape:
        raise ValueError('xx and yy must have the same length')
    # np.polyval evaluates a polynomial of any degree, so constant or cubic
    # fits are handled correctly instead of being treated as quadratics.
    residuals = y_values - np.polyval(coef_q2, x_values)
    return float(np.mean(residuals ** 2))
# Measure the American quadratic fit against the measured data (year, u_woman).
# xx and yy are points sampled from the fitted parabola itself, so passing them
# only yields floating-point round-off (about 1e-25), not a goodness of fit.
var_q2 = calculate_variance(year, u_woman, coef_q2)
print('Variance for the American Women quadratic is', var_q2)
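# These look really accurate, but NOTE(review): the ~1e-25 values recorded below
# come from comparing each parabola with points sampled from that same parabola
# (x, y and xx, yy), not with the measured data, so they are only round-off error.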
# Recorded notebook output:
# Variance for the Dutch Women quadratic is 3.308722450212111e-25
# Variance for the American Women quadratic is 1.0339757656912846e-25
# Use best fit to predict average heights in 1955 and 1995 for both Dutch and U.S.; compute percent error;
# round values to 2 decimal places
# Measured average heights in centimeters, copied from the comparison values
# listed in the comments at the top of the file.
actual_heights = {
    'Dutch': {1925: 162.2, 1955: 167.11, 1995: 168.73},
    'American': {1925: 160.97, 1955: 163.54, 1995: 163.56},
}
# Linear and quadratic fits for each country.
fits = {
    'Dutch': {'linear': f1, 'quadratic': g1},
    'American': {'linear': f2, 'quadratic': g2},
}
for country in ('Dutch', 'American'):
    for target_year in (1925, 1955, 1995):
        actual = actual_heights[country][target_year]
        percent_errors = {}
        for fit_name in ('linear', 'quadratic'):
            predicted = fits[country][fit_name](target_year)
            # Percent error = |predicted - actual| / actual * 100
            percent_errors[fit_name] = abs(predicted - actual) / actual * 100
            print(country, 'women in', target_year, '-', fit_name, 'fit predicts',
                  round(predicted, 2), 'with a percent error of',
                  round(percent_errors[fit_name], 2))
        # Pick the fit from the numbers instead of hard-coding the conclusion.
        best_fit = min(percent_errors, key=percent_errors.get)
        print('In', str(target_year) + ', the', country, 'women averaged at',
              str(actual) + ', therefore the most accurate fit is the', best_fit)
        print('***************************************************')
# Recorded notebook output:
# 161.4727272727274
# 162.23869318181823
# In 1925,the Dutch women averaged at 162.2, therefore the most accurate fit is the quadratic with the value 162.23869318181823
# ***************************************************
# 171.8861818181819
# 168.8673749999989
# In 1955,the Dutch women averaged at 168.73, therefore the most accurate fit is the quadratic with the value 168.8673749999989
# ***************************************************
# 160.94057575757571
# 161.34416477272816
# In 1925,the American women averaged at 160.97, therefore the most accurate fit is the linear with the value 160.94057575757571
# ***************************************************
# 165.2190606060606
# 163.62844507575755
# In 1955,the American women averaged at 163.56, therefore the most accurate fit is the quadratic with the value 163.62844507575755
# ***************************************************