# Project 4 - Regression

# Import libraries
import numpy as np
import matplotlib.pyplot as plt

# Exact average heights used later to check the regression predictions.
# NOTE(review): values are presumably in cm, matching the "Heights in cm"
# axis label used for the plotted data -- confirm against the data source.

# Men
dutch_man_1925 = 174.83
us_man_1925 = 174.53
dutch_man_1955 = 180.23
us_man_1955 = 177.22
dutch_man_1995 = 182.54
us_man_1995 = 177.16

# Women
dutch_woman_1925 = 162.2
us_woman_1925 = 160.97
dutch_woman_1955 = 167.11
us_woman_1955 = 163.54
dutch_woman_1995 = 168.73
us_woman_1995 = 163.56

# Read in the data and plot; add axes labels, plot title and legend.
# Note: to add a legend, pass label=... to plt.plot (e.g., label='Dutch Men')
# and then call plt.legend().
#
# The file is parsed once: columns 0, 1 and 3 become year, Dutch men and
# U.S. men respectively (unpack=True returns one array per selected column).
# The first two rows are skipped (presumably header lines -- confirm against
# the data file).
year, dutchmen, usmen = np.loadtxt(
    'human_heights.txt', usecols=(0, 1, 3), skiprows=2, unpack=True
)

plt.plot(year, dutchmen, 'bo', label='Dutch Men')
plt.plot(year, usmen, 'ro', label='American Men')
plt.xlabel("Year")
plt.ylabel("Heights in cm")
plt.title("Yearly Dutch and American Male Heights")
plt.legend()
plt.show()

# Linear regression fit for both Dutch and U.S.; plot and print out the line.
# np.polyfit returns coefficients highest power first: [slope, intercept].
Dutchline_coeff = np.polyfit(year, dutchmen, 1)
USline_coeff = np.polyfit(year, usmen, 1)

# Callable polynomials built from the fitted coefficients.
f = np.poly1d(Dutchline_coeff)
g = np.poly1d(USline_coeff)

# Years at which the fitted curves are evaluated for plotting.
yeareval = [1900, 1925, 1955, 1985]
Dutcheval = f(yeareval)
USeval = g(yeareval)

# The ':+' format always prints the intercept's sign, so both equations read
# "<slope>x +<intercept>" or "<slope>x -<intercept>" (never a missing sign or "+-").
print(f"The equation of the Dutch line using linear regression is "
      f"{Dutchline_coeff[0]}x {Dutchline_coeff[1]:+}")
print(f"The equation of the US line using linear regression is "
      f"{USline_coeff[0]}x {USline_coeff[1]:+}")

plt.plot(year, dutchmen, 'bo', label='Dutch Men')
plt.plot(yeareval, Dutcheval, '-b')
plt.plot(year, usmen, 'ro', label='American Men')
plt.plot(yeareval, USeval, 'r-')
plt.xlabel("Year")
plt.ylabel("Heights in cm")
plt.title("Yearly Dutch and American Male Heights")
plt.legend()
plt.show()

# Calculate the variance for each fit; use the function that we wrote in a
# previous notebook.
def calculate_variance(x, y, coeff):
    """Return the mean squared residual of a polynomial fit.

    Input:
        x, y  -- indexable sequences holding the data points; y must have at
                 least len(x) entries.
        coeff -- polynomial coefficients, highest power first (the order
                 used when indexing the results of np.polyfit in this file).
                 Any degree is accepted.

    Output:
        The sum of squared differences between y[i] and the polynomial
        evaluated at x[i], divided by len(x).

    Raises ZeroDivisionError when x is empty.
    """
    n = len(x)
    squared_error_sum = 0.0
    for i, x_i in enumerate(x):
        # Horner's rule evaluates the polynomial for any number of coefficients.
        y_fit = 0.0
        for c in coeff:
            y_fit = y_fit * x_i + c
        residual = y[i] - y_fit
        squared_error_sum += residual * residual
    return squared_error_sum / float(n)


Dutchvarlinear = calculate_variance(year, dutchmen, Dutchline_coeff)
USvarlinear = calculate_variance(year, usmen, USline_coeff)
print("The variance for a linear fit to the Dutch men data is", Dutchvarlinear)
print("The variance for a linear fit to the US men data is", USvarlinear)

# Quadratic regression fit for Dutch and U.S.; plot and print out the parabolas.
# np.polyfit returns coefficients highest power first, so index 0 is the
# x^2 coefficient, index 1 the x coefficient and index 2 the constant term.
Dutchquad_coeff = np.polyfit(year, dutchmen, 2)
USquad_coeff = np.polyfit(year, usmen, 2)

# Callable polynomials built from the fitted coefficients.
y = np.poly1d(Dutchquad_coeff)
z = np.poly1d(USquad_coeff)
Dutchquadeval = y(yeareval)
USquadeval = z(yeareval)

print(f"The equation for Dutch men using quadratic regression is "
      f"{Dutchquad_coeff[0]}*x^2 + {Dutchquad_coeff[1]}*x + {Dutchquad_coeff[2]}")
print(f"The equation for US men using quadratic regression is "
      f"{USquad_coeff[0]}*x^2 + {USquad_coeff[1]}*x + {USquad_coeff[2]}")

plt.plot(year, dutchmen, 'bo', label='Dutch Men')
plt.plot(yeareval, Dutchquadeval, '-b')
plt.plot(year, usmen, 'ro', label='American Men')
plt.plot(yeareval, USquadeval, 'r-')
plt.xlabel("Year")
plt.ylabel("Heights in cm")
plt.title("Yearly Dutch and American Male Heights")
plt.legend()
plt.show()

# Calculate variance for the quadratic fits.
# Each pair is (height data, quadratic coefficients); Dutch first, then U.S.
_quad_fits = ((dutchmen, Dutchquad_coeff), (usmen, USquad_coeff))
Dutchvarquad, USvarquad = [
    calculate_variance(year, heights, coeffs) for heights, coeffs in _quad_fits
]

print("The variance for a quadratic fit to the Dutch men data is", Dutchvarquad)
print("The variance for a quadratic fit to the US men data is", USvarquad)

# Use best fit to predict average heights in 1955 and 1995 for both Dutch and
# U.S.; compute percent error; round values to 2 decimal places.
# y and z are the quadratic-fit polynomials for Dutch and U.S. men.
Dutchpredictedquad1955 = y(1955)
Dutchpredictedquad1995 = y(1995)
USpredictedquad1955 = z(1955)
USpredictedquad1995 = z(1995)

# Relative error |predicted - exact| / exact, stored as a fraction; it is
# shown as a percentage with 2 decimals by the '.2%' format below.
Dutch1955se = abs(Dutchpredictedquad1955 - dutch_man_1955) / dutch_man_1955
Dutch1995se = abs(Dutchpredictedquad1995 - dutch_man_1995) / dutch_man_1995
us1955se = abs(USpredictedquad1955 - us_man_1955) / us_man_1955
us1995se = abs(USpredictedquad1995 - us_man_1995) / us_man_1995

print(f"The predicted heights for Dutch men in 1955 is {Dutchpredictedquad1955:.2f} "
      f"and in 1995 is {Dutchpredictedquad1995:.2f}")
print(f"The predicted heights for American men in 1955 is {USpredictedquad1955:.2f} "
      f"and in 1995 is {USpredictedquad1995:.2f}")
print(f"The percent error for 1955 is {Dutch1955se:.2%} for Dutch men "
      f"and {us1955se:.2%} for American men")
print(f"The percent error for 1995 is {Dutch1995se:.2%} for Dutch men "
      f"and {us1995se:.2%} for American men")