# Run this cell to start.
import numpy as np
import pandas as pd
from scipy import optimize as op
import random
# Safe settings for Pandas.
from sklearn.linear_model import LinearRegression
pd.set_option('mode.chained_assignment', 'raise')
%matplotlib inline
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
plt.style.use('fivethirtyeight')
# Get some data from a line and add noise to make realistic linear data
def make_a_noisy_line(slope, intercept, x_values, noise):
    """Return y values on the line ``slope * x + intercept`` plus Gaussian noise.

    Parameters
    ----------
    slope : float
        Slope of the generating line.
    intercept : float
        Intercept of the generating line.
    x_values : array-like
        x positions at which to evaluate the line.
    noise : float
        Standard deviation of the zero-mean normal noise added to each point.

    Returns
    -------
    numpy.ndarray
        Float array with one noisy y value per element of ``x_values``.
    """
    x = np.asarray(x_values, dtype=float)
    # One independent normal draw per point; the slope argument is honoured
    # here (the line used to be evaluated with a fixed slope of 2).
    jitter = np.random.normal(0, noise, size=x.shape)
    return slope * x + intercept + jitter
# Parameters of the generating line and of the noise added to it.
gen_slope = 8
gen_intercept = 10
noise = 10
number_of_values = 20

# x runs over 1..number_of_values inclusive, so exactly number_of_values
# points are produced (the upper bound of arange is exclusive).
x_values = np.arange(1, number_of_values + 1)
y_values = make_a_noisy_line(gen_slope, gen_intercept, x_values, noise)

# Raw data plus its least-squares straight line.
plt.plot(x_values, y_values, 'o')
real_slope, real_intercept = np.polyfit(x_values, y_values, 1)
plt.plot(x_values, real_slope * x_values + real_intercept)

# The same y values in random order against the same x values, with the
# least-squares line for this shuffled pairing.
y_shuffled = np.random.permutation(y_values)
plt.plot(x_values, y_shuffled, 'o')
m, b = np.polyfit(x_values, y_shuffled, 1)
plt.plot(x_values, m * x_values + b)
def sse_for_line(slope_intercept, x_y):
    """Return the sum of squared errors of a straight line against data.

    Parameters
    ----------
    slope_intercept : sequence of two numbers
        ``(slope, intercept)`` of the candidate line.
    x_y : sequence of two array-likes
        ``(x, y)`` observations of equal length.

    Returns
    -------
    number
        Sum over all points of ``(slope * x + intercept - y) ** 2``.
    """
    slope, intercept = slope_intercept
    x, y = x_y
    x = np.asarray(x)
    y = np.asarray(y)
    residuals = (x * slope) + intercept - y
    # np.sum reduces the array in compiled code instead of iterating over it
    # element by element with the Python builtin.
    return np.sum(residuals ** 2)
# Starting point handed to the optimiser -- not too important what these
# values are. They are defined before their first use so the file also runs
# from top to bottom.
slope_guess = 1
intercept_guess = 0

# Sum of squared errors of the initial guess against the real data.
sse_for_line([slope_guess, intercept_guess], [x_values, y_values])

# Now generate a whole bunch of random slopes: shuffle y, refit a line by
# minimising the sum of squared errors, and record the fitted slope.
number_of_samples = 1000
fake_slopes = np.zeros(number_of_samples)
for i in range(number_of_samples):
    y_shuffled = np.random.permutation(y_values)
    # `args` is a one-element tuple, so the [x, y] pair reaches sse_for_line
    # as its single extra argument `x_y`.
    current_min = op.minimize(
        sse_for_line,
        [slope_guess, intercept_guess],
        args=([x_values, y_shuffled],),
    )
    fake_slopes[i] = current_min.x[0]
# Proportion of shuffled-data slopes that are at least as large as the slope
# fitted to the real data.
slope_probability = np.count_nonzero(real_slope <= fake_slopes) / fake_slopes.size
print(slope_probability)
# Histogram of the shuffled-data slopes, with the real slope marked by a
# vertical red line for comparison.
plt.hist(x=fake_slopes)
plt.axvline(x=real_slope, color='red')
# Load the hybrid-car table and keep only the rows whose class is 'SUV'.
hybrid = pd.read_csv('hybrid.csv')
suv = hybrid[hybrid['class'].eq('SUV')]

# Price against fuel economy, first via pandas and then via matplotlib.
suv.plot.scatter(x='msrp', y='mpg')
plt.scatter(x=suv['msrp'], y=suv['mpg'])

# Same limits on both axes.
# NOTE(review): the y limit equals the x limit although it is applied to the
# mpg axis -- presumably deliberate, to show the two variables on one scale;
# confirm.
plt.xlim(0, 100000)
plt.ylim(0, 100000)
plt.scatter(x=suv['msrp'], y=suv['mpg'])

# Overlay the least-squares line, drawn between the smallest and largest msrp.
m, b = np.polyfit(suv['msrp'], suv['mpg'], deg=1)
x_line_values = np.array([suv['msrp'].min(), suv['msrp'].max()])
y_line_values = m * x_line_values + b
plt.plot(x_line_values, y_line_values, color="red")
def standard_score(my_values):
    """Convert values to standard scores.

    Each value has the mean of all values subtracted and is then divided by
    their standard deviation (NumPy's default, i.e. population, standard
    deviation).

    Parameters
    ----------
    my_values : array-like
        Numbers to standardise; copied into a NumPy array first.

    Returns
    -------
    numpy.ndarray
        One standard score per input value.
    """
    arr = np.array(my_values)
    centred = arr - arr.mean()
    return centred / arr.std()
# Express both SUV columns as standard scores.
s_msrp = standard_score(suv['msrp'])
s_mpg = standard_score(suv['mpg'])

# Scatter of the standardised values, with labelled axes.
plt.scatter(x=s_msrp, y=s_mpg)
plt.xlabel('msrp')
plt.ylabel('mpg')

# Overlay the least-squares line between the smallest and largest
# standardised msrp.
m, b = np.polyfit(s_msrp, s_mpg, deg=1)
x_line_values = np.array([s_msrp.min(), s_msrp.max()])
y_line_values = m * x_line_values + b
plt.plot(x_line_values, y_line_values, color="red")