Week 5 lecture: permutation, slopes, correlations

# Run this cell to start. import numpy as np import pandas as pd from scipy import optimize as op import random # Safe settings for Pandas. from sklearn.linear_model import LinearRegression pd.set_option('mode.chained_assignment', 'raise') %matplotlib inline from mpl_toolkits.mplot3d import Axes3D import numpy as np import matplotlib.pyplot as plt from matplotlib import cm plt.style.use('fivethirtyeight')

# Get some data from a line and add noise to make realistic linear data
def make_a_noisy_line(slope, intercept, x_values, noise):
    """Return noisy y values scattered around ``slope * x + intercept``.

    Parameters
    ----------
    slope : float
        Slope of the underlying line.
    intercept : float
        Intercept of the underlying line.
    x_values : array-like
        x coordinates at which to generate points.
    noise : float
        Standard deviation of the zero-mean Gaussian noise added to each
        point independently.

    Returns
    -------
    numpy.ndarray
        One noisy y value per entry of ``x_values``.
    """
    x = np.asarray(x_values, dtype=float)
    # Honor the caller's slope; it was previously hard-coded to 2, so the
    # ``slope`` argument had no effect on the generated data.
    return slope * x + intercept + np.random.normal(0, noise, size=x.shape)


gen_slope = 8
gen_intercept = 10
noise = 10
number_of_values = 20

# np.arange excludes its stop value, so stop one past the end to get exactly
# ``number_of_values`` points (1, 2, ..., number_of_values).
x_values = np.arange(1, number_of_values + 1)
y_values = make_a_noisy_line(gen_slope, gen_intercept, x_values, noise)
plt.plot(x_values, y_values, 'o')

# Least-squares straight-line fit to the noisy data, drawn over the points.
real_slope, real_intercept = np.polyfit(x_values, y_values, 1)
plt.plot(x_values, real_slope * x_values + real_intercept)

# Shuffle the y values so they are paired with x at random, then plot the
# shuffled points together with their own least-squares line.
y_shuffled = np.random.permutation(y_values)
plt.plot(x_values, y_shuffled, 'o')

m, b = np.polyfit(x_values, y_shuffled, deg=1)
plt.plot(x_values, b + m * x_values)

def sse_for_line(slope_intercept, x_y):
    """Return the sum of squared errors of a line's predictions.

    Parameters
    ----------
    slope_intercept : sequence of two floats
        ``(slope, intercept)`` of the candidate line.
    x_y : sequence of two array-likes
        ``(x, y)`` observed data, of equal length.

    Returns
    -------
    float
        Sum over all points of ``(slope * x + intercept - y) ** 2``.
    """
    slope, intercept = slope_intercept
    x, y = x_y
    x = np.asarray(x)
    y = np.asarray(y)
    error = (x * slope + intercept) - y
    # np.sum reduces the array in compiled code instead of iterating over it
    # element by element with the built-in sum.
    return np.sum(error ** 2)


# Define the starting guesses before their first use so this call does not
# raise NameError when the cells are run from top to bottom.
slope_guess = 1
intercept_guess = 0
sse_for_line([slope_guess, intercept_guess], [x_values, y_values])

# Now generate a whole bunch of random slopes
number_of_samples = 1000
fake_slopes = np.zeros(number_of_samples)

# Starting point for the optimizer -- the exact values are not too important.
slope_guess = 1
intercept_guess = 0

for i in np.arange(number_of_samples):
    # Re-pair x with a random ordering of y, then find the slope/intercept
    # that minimizes the sum of squared errors for that shuffled data.
    y_shuffled = np.random.permutation(y_values)
    current_min = op.minimize(
        sse_for_line,
        x0=[slope_guess, intercept_guess],
        args=[x_values, y_shuffled],
    )
    # The first fitted parameter is the slope.
    fake_slopes[i] = current_min.x[0]

# Is the real slope unusual for shuffled numbers?
# Proportion of shuffled-data slopes that are at least as large as the real one.
n_at_least_real = np.count_nonzero(fake_slopes >= real_slope)
slope_probability = n_at_least_real / len(fake_slopes)
print(slope_probability)

# Is the real slope possible by chance? part 2
# Histogram of the shuffled-data slopes, with the real slope marked by a
# vertical red line for comparison.
plt.hist(fake_slopes)
plt.axvline(x=real_slope, color='red')

# Load the table from hybrid.csv and keep only the rows whose class is 'SUV'.
hybrid = pd.read_csv('hybrid.csv')
is_suv = hybrid['class'] == 'SUV'
suv = hybrid[is_suv]

# Scatter plot of mpg against msrp for the SUV rows.
suv.plot.scatter(x='msrp', y='mpg')

# Same scatter, but with both axes forced to the identical 0-100000 range.
plt.scatter(suv['msrp'], suv['mpg'])
shared_limits = (0, 100000)
plt.xlim(*shared_limits)
plt.ylim(*shared_limits)

plt.scatter(suv['msrp'], suv['mpg'])

# put on the best-fit line
m, b = np.polyfit(suv['msrp'], suv['mpg'], deg=1)
# A straight line only needs its two end points: the smallest and largest msrp.
x_line_values = np.array([suv['msrp'].min(), suv['msrp'].max()])
y_line_values = m * x_line_values + b
plt.plot(x_line_values, y_line_values, color="red")

def standard_score(my_values):
    """Convert values to standard scores.

    Each value has the mean of all the values subtracted from it and is then
    divided by their standard deviation (NumPy's default ``np.std``).

    Parameters
    ----------
    my_values : array-like
        The values to standardize.

    Returns
    -------
    numpy.ndarray
        The standard score of each input value.
    """
    values = np.array(my_values)  # to make sure values are in a np.array
    centered = values - values.mean()
    return centered / values.std()


s_msrp = standard_score(suv['msrp'])
s_mpg = standard_score(suv['mpg'])

plt.scatter(s_msrp, s_mpg)
plt.xlabel('msrp')
plt.ylabel('mpg')

# put on the best-fit line
m, b = np.polyfit(s_msrp, s_mpg, deg=1)
x_line_values = np.array([s_msrp.min(), s_msrp.max()])
y_line_values = m * x_line_values + b
plt.plot(x_line_values, y_line_values, color="red")