# Silence library deprecation/user warnings so notebook output stays readable.
import warnings
warnings.filterwarnings('ignore')
#importing the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the car price dataset (205 rows x 26 columns per info() below).
# Assumes CarPrice_Assignment.csv sits in the working directory.
cars = pd.read_csv('CarPrice_Assignment.csv')
cars.head()
car_IDint64
symbolingint64
0
1
3
1
2
3
2
3
1
3
4
2
4
5
2
cars.shape  # (rows, columns) sanity check
cars.describe()  # summary statistics for the numeric columns
car_IDfloat64
symbolingfloat64
count
205
205
mean
103
0.8341463415
std
59.32256457
1.245306828
min
1
-2
25%
52
0
50%
103
1
75%
154
2
max
205
3
cars.info()  # dtypes and non-null counts: all 205 rows complete, no missing values
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 car_ID 205 non-null int64
1 symboling 205 non-null int64
2 CarName 205 non-null object
3 fueltype 205 non-null object
4 aspiration 205 non-null object
5 doornumber 205 non-null object
6 carbody 205 non-null object
7 drivewheel 205 non-null object
8 enginelocation 205 non-null object
9 wheelbase 205 non-null float64
10 carlength 205 non-null float64
11 carwidth 205 non-null float64
12 carheight 205 non-null float64
13 curbweight 205 non-null int64
14 enginetype 205 non-null object
15 cylindernumber 205 non-null object
16 enginesize 205 non-null int64
17 fuelsystem 205 non-null object
18 boreratio 205 non-null float64
19 stroke 205 non-null float64
20 compressionratio 205 non-null float64
21 horsepower 205 non-null int64
22 peakrpm 205 non-null int64
23 citympg 205 non-null int64
24 highwaympg 205 non-null int64
25 price 205 non-null float64
dtypes: float64(8), int64(8), object(10)
memory usage: 41.8+ KB
# Splitting company name from CarName column.
# CarName holds "company model"; keep only the leading company token.
CompanyName = cars['CarName'].apply(lambda x: x.split(' ')[0])
cars.insert(3, "CompanyName", CompanyName)
cars.drop(['CarName'], axis=1, inplace=True)
cars.head()
cars.CompanyName.unique()

# Normalise case before fixing the known misspellings in the raw data.
cars['CompanyName'] = cars['CompanyName'].str.lower()


def replace_name(a, b):
    """Replace company-name spelling `a` with `b` in the global `cars` frame.

    Assigns the result back to the column rather than using chained
    `cars.CompanyName.replace(..., inplace=True)`, which is deprecated in
    modern pandas and may silently operate on a copy.
    """
    cars['CompanyName'] = cars['CompanyName'].replace(a, b)


# Known misspellings observed in CompanyName.unique() above.
replace_name('alfa-romero', 'alfa-romeo')
replace_name('maxda', 'mazda')
replace_name('porcshce', 'porsche')
replace_name('toyouta', 'toyota')
replace_name('vokswagen', 'volkswagen')
replace_name('vw', 'volkswagen')
cars.CompanyName.unique()

# Checking for duplicates (full-row duplicate scan; none expected).
cars.loc[cars.duplicated()]
cars.columns

# Target variable EDA: distribution plus spread of price.
plt.figure(figsize=(20, 8))
plt.subplot(1, 2, 1)
plt.title('Car Price Distribution Plot')
sns.distplot(cars.price)  # NOTE: distplot is deprecated in seaborn>=0.11 (histplot is the successor)
plt.subplot(1, 2, 2)
plt.title('Car Price Spread')
sns.boxplot(y=cars.price)
plt.show()
print(cars.price.describe(percentiles=[0.25, 0.50, 0.75, 0.85, 0.90, 1]))
count 205.000000
mean 13276.710571
std 7988.852332
min 5118.000000
25% 7788.000000
50% 10295.000000
75% 16503.000000
85% 18500.000000
90% 22563.000000
100% 45400.000000
max 45400.000000
Name: price, dtype: float64
# Frequency bar charts: company, fuel type and body style, side by side.
plt.figure(figsize=(25, 6))
bar_specs = [
    ('CompanyName', 'Companies Histogram', 'Car company', 'Frequency of company'),
    ('fueltype', 'Fuel Type Histogram', 'Fuel Type', 'Frequency of fuel type'),
    ('carbody', 'Car Type Histogram', 'Car Type', 'Frequency of Car type'),
]
for position, (column, title, xlab, ylab) in enumerate(bar_specs, start=1):
    plt.subplot(1, 3, position)
    plt1 = cars[column].value_counts().plot(kind='bar')
    plt.title(title)
    plt1.set(xlabel=xlab, ylabel=ylab)
plt.show()
# Symboling (insurance risk rating): frequency and its relation to price.
plt.figure(figsize=(20,8))
plt.subplot(1,2,1)
plt.title('Symboling Histogram')
sns.countplot(x=cars.symboling)  # keyword arg: positional data was removed in seaborn>=0.12
plt.subplot(1,2,2)
plt.title('Symboling vs Price')
sns.boxplot(x=cars.symboling, y=cars.price)
plt.show()
# Engine type: frequency, price spread, and mean price per type.
plt.figure(figsize=(20,8))
plt.subplot(1,2,1)
plt.title('Engine Type Histogram')
sns.countplot(x=cars.enginetype)  # keyword arg: positional data was removed in seaborn>=0.12
plt.subplot(1,2,2)
plt.title('Engine Type vs Price')
sns.boxplot(x=cars.enginetype, y=cars.price)
# Average price per engine type, highest first.
df = pd.DataFrame(cars.groupby(['enginetype'])['price'].mean().sort_values(ascending = False))
df.plot.bar(figsize=(8,6))
plt.title('Engine Type vs Average Price')
plt.show()
# Mean price per category for company, fuel type and body style.
plt.figure(figsize=(25, 6))  # NOTE: df.plot.bar() opens its own figure, so this one stays empty
for column, title in [('CompanyName', 'Company Name vs Average Price'),
                      ('fueltype', 'Fuel Type vs Average Price'),
                      ('carbody', 'Car Type vs Average Price')]:
    df = pd.DataFrame(cars.groupby([column])['price'].mean().sort_values(ascending=False))
    df.plot.bar()
    plt.title(title)
plt.show()
# Door count: frequency and price spread.
plt.figure(figsize=(15,5))
plt.subplot(1,2,1)
plt.title('Door Number Histogram')
sns.countplot(x=cars.doornumber)  # keyword arg: positional data was removed in seaborn>=0.12
plt.subplot(1,2,2)
plt.title('Door Number vs Price')
sns.boxplot(x=cars.doornumber, y=cars.price)
plt.show()
# Aspiration (std vs turbo): frequency and price spread.
plt.figure(figsize=(15,5))
plt.subplot(1,2,1)
plt.title('Aspiration Histogram')
sns.countplot(x=cars.aspiration)  # keyword arg for the same seaborn compatibility reason
plt.subplot(1,2,2)
plt.title('Aspiration vs Price')
sns.boxplot(x=cars.aspiration, y=cars.price)
plt.show()
def plot_count(x, fig):
    """Draw a count plot and a price boxplot for categorical column `x`.

    Places them at positions `fig` and `fig`+1 of the current 4x2 subplot
    grid; reads the global `cars` frame.
    """
    plt.subplot(4, 2, fig)
    plt.title(x + ' Histogram')
    sns.countplot(x=cars[x])  # keyword arg: positional data was removed in seaborn>=0.12
    plt.subplot(4, 2, (fig + 1))
    plt.title(x + ' vs Price')
    sns.boxplot(x=cars[x], y=cars.price)


plt.figure(figsize=(15, 20))
plot_count('enginelocation', 1)
plot_count('cylindernumber', 3)
plot_count('fuelsystem', 5)
plot_count('drivewheel', 7)
plt.tight_layout()
def scatter(x, fig):
    """Scatter numeric column `x` against price at slot `fig` of a 5x2 grid."""
    plt.subplot(5, 2, fig)
    plt.scatter(cars[x], cars['price'])
    plt.title(x + ' vs Price')
    plt.ylabel('Price')
    plt.xlabel(x)


# Dimension/weight features vs price, one panel each.
plt.figure(figsize=(10, 20))
for position, column in enumerate(['carlength', 'carwidth', 'carheight', 'curbweight'], start=1):
    scatter(column, position)
plt.tight_layout()
# Derive New field - Fuel economy: 55/45 weighted blend of city and highway mpg.
cars['fueleconomy'] = (0.55 * cars['citympg']) + (0.45 * cars['highwaympg'])

# Binning the Car Companies based on avg prices of each Company.
cars['price'] = cars['price'].astype('int')
company_avg_price = cars.groupby(['CompanyName'])['price'].mean()
bins = [0, 10000, 20000, 40000]
cars_bin = ['Budget', 'Medium', 'Highend']
# map() keeps row alignment with `cars` directly, replacing the original
# copy + merge + 'price_y'-suffix round trip with the same result.
cars['carsrange'] = pd.cut(cars['CompanyName'].map(company_avg_price),
                           bins, right=False, labels=cars_bin)
cars.head()
# Fuel economy vs price, coloured by drivetrain.
plt.figure(figsize=(8,6))
plt.title('Fuel economy vs Price')
sns.scatterplot(x=cars['fueleconomy'],y=cars['price'], hue=cars['drivewheel'])
plt.xlabel('Fuel Economy')
plt.ylabel('Price')
plt.show()
plt.tight_layout()  # NOTE: no-op after show() — the figure is already rendered
# Candidate predictors chosen from the EDA above.
# NOTE(review): this 16-column subset is later overwritten by `cars_lr = cars`
# before dummy encoding — confirm which selection is intended.
cars_lr = cars[['price', 'fueltype', 'aspiration','carbody', 'drivewheel','wheelbase',
'curbweight', 'enginetype', 'cylindernumber', 'enginesize', 'boreratio','horsepower',
'fueleconomy', 'carlength','carwidth', 'carsrange']]
cars_lr.head()
sns.pairplot(cars_lr)  # pairwise scatter matrix of the candidate features
plt.show()
# Check the corr values of final list of variables
cor = cars.corr()  # NOTE: pandas>=2.0 would need numeric_only=True here
cor
# Collect every pair of numeric fields with |correlation| above 0.8.
correlated_features = set()
for row in range(len(cor.columns)):
    for col in range(row):  # lower triangle only, so each pair prints once
        strength = abs(cor.iloc[row, col])
        if strength > 0.8:
            first = cor.columns[row]
            second = cor.columns[col]
            print(strength, "--", row, '--', col, '--', first, '--', second)
            correlated_features.add(first)
            correlated_features.add(second)
0.874587475964264 -- 3 -- 2 -- carlength -- wheelbase
0.8411182684818453 -- 4 -- 3 -- carwidth -- carlength
0.8777284608306433 -- 6 -- 3 -- curbweight -- carlength
0.8670324646791233 -- 6 -- 4 -- curbweight -- carwidth
0.850594073426277 -- 7 -- 6 -- enginesize -- curbweight
0.8097686545377302 -- 11 -- 7 -- horsepower -- enginesize
0.8014561756662708 -- 13 -- 11 -- citympg -- horsepower
0.9713370423425045 -- 14 -- 13 -- highwaympg -- citympg
0.835305437204371 -- 15 -- 6 -- price -- curbweight
0.8741451426986426 -- 15 -- 7 -- price -- enginesize
0.8081381197889799 -- 15 -- 11 -- price -- horsepower
0.9938444201653043 -- 16 -- 13 -- fueleconomy -- citympg
0.9916921560568324 -- 16 -- 14 -- fueleconomy -- highwaympg
# Show which numeric fields entered the correlation matrix and which were flagged.
print(cor.columns)
print('------')
print(correlated_features)
Index(['car_ID', 'symboling', 'wheelbase', 'carlength', 'carwidth',
'carheight', 'curbweight', 'enginesize', 'boreratio', 'stroke',
'compressionratio', 'horsepower', 'peakrpm', 'citympg', 'highwaympg',
'price', 'fueleconomy'],
dtype='object')
------
{'fueleconomy', 'curbweight', 'carwidth', 'enginesize', 'wheelbase', 'highwaympg', 'price', 'citympg', 'carlength', 'horsepower'}
# Spot-check the strongest pair: highway vs city mpg (~0.97 above).
cor.loc['highwaympg', 'citympg']  # .loc instead of chained cor[a][b] indexing
# Print the Correlation values of the High Correlated fields.
# list(...) because set indexers are rejected by modern pandas __getitem__.
corh = cars[list(correlated_features)].corr()
corh
# Produce a Heatmap
plt.figure(figsize=(14,14))
sns.heatmap(corh, annot=True, linewidths=.5, fmt=".2f", cmap="YlGnBu")
# Enumerate each categorical column's levels to preview what get_dummies
# will create (same print order as the original cell).
for column in ['fueltype', 'aspiration', 'carbody', 'drivewheel', 'enginetype',
               'cylindernumber', 'carsrange', 'fuelsystem', 'CompanyName',
               'doornumber', 'enginelocation']:
    print(cars[column].unique())
['gas' 'diesel']
['std' 'turbo']
['convertible' 'hatchback' 'sedan' 'wagon' 'hardtop']
['rwd' 'fwd' '4wd']
['dohc' 'ohcv' 'ohc' 'l' 'rotor' 'ohcf' 'dohcv']
['four' 'six' 'five' 'three' 'twelve' 'two' 'eight']
['Medium', 'Highend', 'Budget']
Categories (3, object): ['Budget' < 'Medium' < 'Highend']
['mpfi' '2bbl' 'mfi' '1bbl' 'spfi' '4bbl' 'idi' 'spdi']
['alfa-romeo' 'audi' 'bmw' 'chevrolet' 'dodge' 'honda' 'isuzu' 'jaguar'
'mazda' 'buick' 'mercury' 'mitsubishi' 'nissan' 'peugeot' 'plymouth'
'porsche' 'renault' 'saab' 'subaru' 'toyota' 'volkswagen' 'volvo']
['two' 'four']
['front' 'rear']
# Defining the map function
def dummies(x, df, prefix=None):
    """One-hot encode column `x` of `df` and drop the original column.

    Uses drop_first=True, so k levels yield k-1 indicator columns.
    `prefix` is an optional, backward-compatible addition: without it,
    levels shared across columns (e.g. 'two' in both doornumber and
    cylindernumber) produce duplicate dummy column names in the result.

    Returns a new DataFrame; the caller's `df` is left unmodified.
    """
    temp = pd.get_dummies(df[x], drop_first=True, prefix=prefix)
    df = pd.concat([df, temp], axis=1)
    df.drop([x], axis=1, inplace=True)
    return df
# Applying the function to the cars_lr
# NOTE(review): this rebinds cars_lr to the FULL frame, discarding the
# 16-column subset selected earlier — confirm that is intended.
cars_lr = cars
cars_lr = dummies('fueltype',cars_lr)
cars_lr = dummies('aspiration',cars_lr)
cars_lr = dummies('carbody',cars_lr)
cars_lr = dummies('drivewheel',cars_lr)
cars_lr = dummies('enginetype',cars_lr)
# cylindernumber and doornumber both contain a level named 'two', so the
# encoded frame ends up with two identically-named 'two' columns
# (visible in the info() output below).
cars_lr = dummies('cylindernumber',cars_lr)
cars_lr = dummies('carsrange',cars_lr)
cars_lr = dummies('CompanyName',cars_lr)
cars_lr = dummies('doornumber',cars_lr)
cars_lr = dummies('enginelocation',cars_lr)
cars_lr = dummies('fuelsystem',cars_lr)
cars_lr.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 69 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 car_ID 205 non-null int64
1 symboling 205 non-null int64
2 wheelbase 205 non-null float64
3 carlength 205 non-null float64
4 carwidth 205 non-null float64
5 carheight 205 non-null float64
6 curbweight 205 non-null int64
7 enginesize 205 non-null int64
8 boreratio 205 non-null float64
9 stroke 205 non-null float64
10 compressionratio 205 non-null float64
11 horsepower 205 non-null int64
12 peakrpm 205 non-null int64
13 citympg 205 non-null int64
14 highwaympg 205 non-null int64
15 price 205 non-null int64
16 fueleconomy 205 non-null float64
17 gas 205 non-null uint8
18 turbo 205 non-null uint8
19 hardtop 205 non-null uint8
20 hatchback 205 non-null uint8
21 sedan 205 non-null uint8
22 wagon 205 non-null uint8
23 fwd 205 non-null uint8
24 rwd 205 non-null uint8
25 dohcv 205 non-null uint8
26 l 205 non-null uint8
27 ohc 205 non-null uint8
28 ohcf 205 non-null uint8
29 ohcv 205 non-null uint8
30 rotor 205 non-null uint8
31 five 205 non-null uint8
32 four 205 non-null uint8
33 six 205 non-null uint8
34 three 205 non-null uint8
35 twelve 205 non-null uint8
36 two 205 non-null uint8
37 Medium 205 non-null uint8
38 Highend 205 non-null uint8
39 audi 205 non-null uint8
40 bmw 205 non-null uint8
41 buick 205 non-null uint8
42 chevrolet 205 non-null uint8
43 dodge 205 non-null uint8
44 honda 205 non-null uint8
45 isuzu 205 non-null uint8
46 jaguar 205 non-null uint8
47 mazda 205 non-null uint8
48 mercury 205 non-null uint8
49 mitsubishi 205 non-null uint8
50 nissan 205 non-null uint8
51 peugeot 205 non-null uint8
52 plymouth 205 non-null uint8
53 porsche 205 non-null uint8
54 renault 205 non-null uint8
55 saab 205 non-null uint8
56 subaru 205 non-null uint8
57 toyota 205 non-null uint8
58 volkswagen 205 non-null uint8
59 volvo 205 non-null uint8
60 two 205 non-null uint8
61 rear 205 non-null uint8
62 2bbl 205 non-null uint8
63 4bbl 205 non-null uint8
64 idi 205 non-null uint8
65 mfi 205 non-null uint8
66 mpfi 205 non-null uint8
67 spdi 205 non-null uint8
68 spfi 205 non-null uint8
dtypes: float64(8), int64(9), uint8(52)
memory usage: 37.8 KB
cars_lr.shape
from sklearn.model_selection import train_test_split
np.random.seed(0)  # NOTE: redundant — train_test_split uses random_state=100 below
# 70/30 split; random_state fixes the partition for reproducibility.
df_train, df_test = train_test_split(cars_lr, train_size = 0.7, test_size = 0.3, random_state = 100)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# Min-max scale only the continuous fields; the dummy columns stay 0/1.
# The fitted scaler must be reused (transform only) on the test set later.
num_vars = ['wheelbase', 'carheight', 'stroke', 'curbweight', 'enginesize', 'boreratio', 'horsepower','fueleconomy','carlength','carwidth','price']
df_train[num_vars] = scaler.fit_transform(df_train[num_vars])
df_train.head()
df_train.describe()
#Dividing data into X and y variables: price is the regression target.
y_train = df_train.pop('price')
X_train = df_train
# !pip install statsmodels  — notebook shell escape; invalid syntax in a plain .py file (install statsmodels beforehand)
Collecting statsmodels
Downloading statsmodels-0.12.2-cp37-cp37m-manylinux1_x86_64.whl (9.5 MB)
|████████████████████████████████| 9.5 MB 27.5 MB/s
Requirement already satisfied: scipy>=1.1 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels) (1.7.1)
Requirement already satisfied: numpy>=1.15 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels) (1.19.5)
Requirement already satisfied: pandas>=0.21 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels) (1.2.5)
Collecting patsy>=0.5
Downloading patsy-0.5.1-py2.py3-none-any.whl (231 kB)
|████████████████████████████████| 231 kB 78.2 MB/s
Requirement already satisfied: pytz>=2017.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pandas>=0.21->statsmodels) (2021.1)
Requirement already satisfied: python-dateutil>=2.7.3 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pandas>=0.21->statsmodels) (2.8.2)
Requirement already satisfied: six in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from patsy>=0.5->statsmodels) (1.16.0)
Installing collected packages: patsy, statsmodels
Successfully installed patsy-0.5.1 statsmodels-0.12.2
WARNING: You are using pip version 21.2.3; however, version 21.2.4 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
def build_model(X, y):
    """Fit an OLS regression of y on X (with intercept) and print its summary.

    Returns the fitted statsmodels results object — a single value, so
    callers must not tuple-unpack the return.
    """
    design = sm.add_constant(X)       # prepend the intercept column
    fitted = sm.OLS(y, design).fit()  # ordinary least squares fit
    print(fitted.summary())
    return fitted
def checkVIF(X):
    """Return a DataFrame of variance inflation factors, highest VIF first."""
    factors = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    vif = pd.DataFrame({'Features': X.columns,
                        'VIF': [round(f, 2) for f in factors]})
    return vif.sort_values(by="VIF", ascending=False)
# Model 1: baseline OLS on every remaining feature (kitchen-sink fit).
model1 = build_model(X_train, y_train)
OLS Regression Results
==============================================================================
Dep. Variable: price R-squared: 0.975
Model: OLS Adj. R-squared: 0.959
Method: Least Squares F-statistic: 57.58
Date: Thu, 16 Sep 2021 Prob (F-statistic): 3.34e-49
Time: 03:34:46 Log-Likelihood: 282.04
No. Observations: 143 AIC: -446.1
Df Residuals: 84 BIC: -271.3
Df Model: 58
Covariance Type: nonrobust
====================================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------------
const -0.4193 0.205 -2.048 0.044 -0.826 -0.012
car_ID 0.0028 0.002 1.551 0.125 -0.001 0.006
symboling -0.0005 0.007 -0.072 0.943 -0.015 0.014
wheelbase 0.1787 0.100 1.792 0.077 -0.020 0.377
carlength -0.1617 0.100 -1.620 0.109 -0.360 0.037
carwidth 0.2345 0.101 2.319 0.023 0.033 0.436
carheight -0.1306 0.060 -2.175 0.032 -0.250 -0.011
curbweight 0.3391 0.130 2.616 0.011 0.081 0.597
enginesize 2.1169 0.465 4.550 0.000 1.192 3.042
boreratio -0.6585 0.170 -3.872 0.000 -0.997 -0.320
stroke -0.2263 0.088 -2.582 0.012 -0.401 -0.052
compressionratio -0.0245 0.017 -1.453 0.150 -0.058 0.009
horsepower -0.2414 0.209 -1.154 0.252 -0.657 0.175
peakrpm 8.104e-05 2.08e-05 3.895 0.000 3.97e-05 0.000
citympg -0.0038 0.004 -0.847 0.399 -0.013 0.005
highwaympg 0.0017 0.004 0.462 0.646 -0.006 0.009
fueleconomy 0.1702 0.083 2.048 0.044 0.005 0.335
gas -0.3598 0.099 -3.624 0.000 -0.557 -0.162
turbo 0.0811 0.029 2.801 0.006 0.024 0.139
hardtop -0.0890 0.057 -1.567 0.121 -0.202 0.024
hatchback -0.0893 0.045 -1.974 0.052 -0.179 0.001
sedan -0.0747 0.047 -1.575 0.119 -0.169 0.020
wagon -0.0617 0.050 -1.232 0.221 -0.161 0.038
fwd 0.0023 0.025 0.092 0.927 -0.047 0.051
rwd 0.0305 0.034 0.884 0.379 -0.038 0.099
dohcv 0.2926 0.170 1.717 0.090 -0.046 0.631
l 0.1209 0.098 1.229 0.222 -0.075 0.317
ohc 0.0064 0.044 0.144 0.886 -0.082 0.095
ohcf 0.0982 0.058 1.687 0.095 -0.018 0.214
ohcv -0.0183 0.039 -0.466 0.642 -0.097 0.060
rotor 0.4402 0.118 3.738 0.000 0.206 0.674
five 0.3051 0.142 2.142 0.035 0.022 0.588
four 0.4812 0.184 2.616 0.011 0.115 0.847
six 0.1399 0.103 1.357 0.178 -0.065 0.345
three 0.6072 0.143 4.257 0.000 0.324 0.891
twelve -0.3899 0.179 -2.182 0.032 -0.745 -0.035
two 0.4402 0.118 3.738 0.000 0.206 0.674
Medium 0.2023 0.138 1.461 0.148 -0.073 0.478
Highend 0.1491 0.039 3.839 0.000 0.072 0.226
audi 0.0657 0.085 0.772 0.442 -0.104 0.235
bmw 0.3726 0.098 3.800 0.000 0.178 0.568
buick -0.0577 0.054 -1.061 0.292 -0.166 0.050
chevrolet 0.0787 0.088 0.894 0.374 -0.096 0.254
dodge 0.0198 0.082 0.243 0.809 -0.142 0.182
honda 0.0261 0.074 0.354 0.724 -0.120 0.172
isuzu 0.0685 0.051 1.350 0.181 -0.032 0.169
jaguar -0.1552 0.101 -1.544 0.126 -0.355 0.045
mazda -0.1465 0.128 -1.142 0.257 -0.401 0.109
mercury 2.353e-15 5.71e-16 4.123 0.000 1.22e-15 3.49e-15
mitsubishi -0.1568 0.039 -3.996 0.000 -0.235 -0.079
nissan -0.2482 0.190 -1.309 0.194 -0.625 0.129
peugeot -0.4863 0.141 -3.444 0.001 -0.767 -0.205
plymouth -0.2559 0.106 -2.414 0.018 -0.467 -0.045
porsche -0.0106 0.130 -0.081 0.936 -0.270 0.249
renault -0.1552 0.111 -1.403 0.164 -0.375 0.065
saab -0.1479 0.256 -0.578 0.565 -0.657 0.361
subaru -0.1568 0.094 -1.671 0.098 -0.343 0.030
toyota -0.2389 0.174 -1.376 0.172 -0.584 0.106
volkswagen -0.5013 0.344 -1.458 0.149 -1.185 0.182
volvo -0.4205 0.352 -1.195 0.236 -1.120 0.279
two -0.0099 0.015 -0.656 0.514 -0.040 0.020
rear 0.2550 0.071 3.593 0.001 0.114 0.396
2bbl 0.0303 0.056 0.544 0.588 -0.080 0.141
4bbl -0.0436 0.079 -0.554 0.581 -0.200 0.113
idi -0.0595 0.193 -0.308 0.758 -0.443 0.324
mfi 0 0 nan nan 0 0
mpfi -0.0149 0.061 -0.245 0.807 -0.136 0.106
spdi -0.0103 0.066 -0.156 0.876 -0.141 0.121
spfi 0 0 nan nan 0 0
==============================================================================
Omnibus: 37.897 Durbin-Watson: 1.918
Prob(Omnibus): 0.000 Jarque-Bera (JB): 197.279
Skew: 0.763 Prob(JB): 1.45e-43
Kurtosis: 8.548 Cond. No. 1.01e+16
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 3.76e-23. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
X_train.columns
# Drop the high-p-value / collinear columns identified in model1's summary.
X_train1 = X_train.drop(
    ['car_ID', 'wheelbase', 'carlength', 'compressionratio', 'horsepower', 'citympg', 'highwaympg', 'hardtop',
     'sedan', 'wagon', 'fwd', 'rwd', 'dohcv', 'l', 'ohc', 'ohcf', 'ohcv', 'six', 'Medium',
     'audi', 'buick', 'chevrolet', 'dodge', 'honda', 'isuzu', 'jaguar', 'mazda', 'nissan', 'porsche', 'renault', 'saab',
     'subaru', 'toyota', 'volkswagen', 'volvo', 'two', '2bbl', '4bbl', 'idi', 'mfi', 'mpfi', 'spdi'], axis=1)
# build_model returns a single fitted-results object; the original
# `X_train_df, model2 = build_model(...)` tuple-unpack raised
# "TypeError: cannot unpack non-iterable RegressionResultsWrapper object".
model2 = build_model(X_train1, y_train)
OLS Regression Results
==============================================================================
Dep. Variable: price R-squared: 0.956
Model: OLS Adj. R-squared: 0.947
Method: Least Squares F-statistic: 111.3
Date: Thu, 16 Sep 2021 Prob (F-statistic): 2.01e-69
Time: 03:34:46 Log-Likelihood: 239.59
No. Observations: 143 AIC: -431.2
Df Residuals: 119 BIC: -360.1
Df Model: 23
Covariance Type: nonrobust
===============================================================================
coef std err t P>|t| [0.025 0.975]
-------------------------------------------------------------------------------
const -0.4296 0.103 -4.165 0.000 -0.634 -0.225
symboling 0.0046 0.005 0.947 0.346 -0.005 0.014
carwidth 0.3535 0.056 6.297 0.000 0.242 0.465
carheight -0.0522 0.034 -1.529 0.129 -0.120 0.015
curbweight 0.3176 0.094 3.380 0.001 0.132 0.504
enginesize 1.0516 0.196 5.355 0.000 0.663 1.440
boreratio -0.2586 0.052 -4.938 0.000 -0.362 -0.155
stroke -0.2291 0.049 -4.718 0.000 -0.325 -0.133
peakrpm 4.548e-05 1.23e-05 3.702 0.000 2.12e-05 6.98e-05
fueleconomy 0.0473 0.065 0.725 0.470 -0.082 0.176
gas -0.0267 0.027 -0.993 0.322 -0.080 0.027
turbo 0.0570 0.016 3.628 0.000 0.026 0.088
hatchback -0.0197 0.011 -1.773 0.079 -0.042 0.002
rotor 0.3480 0.075 4.669 0.000 0.200 0.496
five 0.0853 0.033 2.565 0.012 0.019 0.151
four 0.1651 0.040 4.077 0.000 0.085 0.245
three 0.3599 0.077 4.655 0.000 0.207 0.513
twelve -0.4233 0.098 -4.316 0.000 -0.617 -0.229
Highend 0.0894 0.029 3.134 0.002 0.033 0.146
bmw 0.2098 0.035 6.019 0.000 0.141 0.279
mercury -7.209e-16 1.8e-16 -4.010 0.000 -1.08e-15 -3.65e-16
mitsubishi -0.0832 0.020 -4.069 0.000 -0.124 -0.043
peugeot -0.1000 0.025 -3.953 0.000 -0.150 -0.050
plymouth -0.0396 0.027 -1.450 0.150 -0.094 0.014
rear 0.3707 0.063 5.910 0.000 0.247 0.495
spfi 0 0 nan nan 0 0
==============================================================================
Omnibus: 33.213 Durbin-Watson: 1.818
Prob(Omnibus): 0.000 Jarque-Bera (JB): 105.032
Skew: 0.826 Prob(JB): 1.56e-23
Kurtosis: 6.860 Cond. No. 1.01e+16
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 3.76e-23. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
Execution error
TypeError: cannot unpack non-iterable RegressionResultsWrapper object
X_train1.columns
checkVIF(X_train1)  # inspect multicollinearity among the survivors
# Drop the remaining high-VIF fields flagged by checkVIF and refit.
X_train2 = X_train1.drop(['enginesize', 'peakrpm', 'curbweight', 'four', 'boreratio', 'carwidth', 'stroke',
'gas', 'fueleconomy', 'carheight', 'rotor'], axis=1)
model3 = build_model(X_train2, y_train)
OLS Regression Results
==============================================================================
Dep. Variable: price R-squared: 0.724
Model: OLS Adj. R-squared: 0.699
Method: Least Squares F-statistic: 28.43
Date: Thu, 16 Sep 2021 Prob (F-statistic): 1.10e-30
Time: 03:46:04 Log-Likelihood: 109.02
No. Observations: 143 AIC: -192.0
Df Residuals: 130 BIC: -153.5
Df Model: 12
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 0.1479 0.016 9.528 0.000 0.117 0.179
symboling 0.0053 0.010 0.544 0.587 -0.014 0.025
turbo 0.1087 0.030 3.650 0.000 0.050 0.168
hatchback -0.0338 0.024 -1.397 0.165 -0.082 0.014
five 0.0703 0.045 1.562 0.121 -0.019 0.159
three -0.1238 0.120 -1.032 0.304 -0.361 0.114
twelve 0.2017 0.126 1.601 0.112 -0.047 0.451
Highend 0.5036 0.043 11.712 0.000 0.419 0.589
bmw -0.0530 0.065 -0.819 0.414 -0.181 0.075
mercury -1.357e-17 2.36e-17 -0.576 0.566 -6.02e-17 3.31e-17
mitsubishi -0.0876 0.045 -1.951 0.053 -0.176 0.001
peugeot 0.0721 0.047 1.530 0.128 -0.021 0.165
plymouth -0.1081 0.061 -1.783 0.077 -0.228 0.012
rear 0.1313 0.129 1.020 0.310 -0.123 0.386
spfi 0 0 nan nan 0 0
==============================================================================
Omnibus: 13.123 Durbin-Watson: 2.184
Prob(Omnibus): 0.001 Jarque-Bera (JB): 14.040
Skew: 0.680 Prob(JB): 0.000894
Kurtosis: 3.710 Cond. No. inf
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 0. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
# Recursive feature elimination: keep the 10 strongest predictors.
lm = LinearRegression()
lm.fit(X_train1, y_train)
# n_features_to_select must be a keyword: the positional form was removed in sklearn 1.2.
rfe = RFE(lm, n_features_to_select=10)
rfe = rfe.fit(X_train1, y_train)
# rfe was fitted on X_train1, so pair the support mask with X_train1's
# columns (the original zipped X_train.columns, silently misaligning names).
list(zip(X_train1.columns, rfe.support_, rfe.ranking_))
X_train1.columns[rfe.support_]
X_train_rfe = X_train[X_train1.columns[rfe.support_]]
X_train_rfe.head()
model4 = build_model(X_train_rfe, y_train)
OLS Regression Results
==============================================================================
Dep. Variable: price R-squared: 0.926
Model: OLS Adj. R-squared: 0.920
Method: Least Squares F-statistic: 164.6
Date: Thu, 16 Sep 2021 Prob (F-statistic): 1.94e-69
Time: 03:46:18 Log-Likelihood: 202.89
No. Observations: 143 AIC: -383.8
Df Residuals: 132 BIC: -351.2
Df Model: 10
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const -0.1329 0.018 -7.455 0.000 -0.168 -0.098
carwidth 0.3184 0.060 5.281 0.000 0.199 0.438
curbweight 0.2935 0.071 4.114 0.000 0.152 0.435
enginesize 0.3779 0.097 3.908 0.000 0.187 0.569
boreratio -0.0752 0.032 -2.320 0.022 -0.139 -0.011
rotor 0.1252 0.035 3.598 0.000 0.056 0.194
three 0.1475 0.063 2.336 0.021 0.023 0.272
twelve -0.0632 0.075 -0.848 0.398 -0.211 0.084
Highend 0.1690 0.029 5.902 0.000 0.112 0.226
bmw 0.1178 0.034 3.450 0.001 0.050 0.185
rear 0.3670 0.068 5.397 0.000 0.233 0.502
==============================================================================
Omnibus: 36.258 Durbin-Watson: 1.846
Prob(Omnibus): 0.000 Jarque-Bera (JB): 85.370
Skew: 1.037 Prob(JB): 2.90e-19
Kurtosis: 6.167 Cond. No. 31.2
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Drop 'twelve' (p = 0.398 in model4, the only insignificant term) and refit.
X_train_new = X_train_rfe.drop(["twelve"], axis = 1)
model5 = build_model(X_train_new,y_train)
OLS Regression Results
==============================================================================
Dep. Variable: price R-squared: 0.925
Model: OLS Adj. R-squared: 0.920
Method: Least Squares F-statistic: 183.2
Date: Thu, 16 Sep 2021 Prob (F-statistic): 1.99e-70
Time: 03:46:24 Log-Likelihood: 202.50
No. Observations: 143 AIC: -385.0
Df Residuals: 133 BIC: -355.4
Df Model: 9
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const -0.1323 0.018 -7.435 0.000 -0.167 -0.097
carwidth 0.3217 0.060 5.351 0.000 0.203 0.441
curbweight 0.3085 0.069 4.466 0.000 0.172 0.445
enginesize 0.3372 0.084 4.021 0.000 0.171 0.503
boreratio -0.0730 0.032 -2.263 0.025 -0.137 -0.009
rotor 0.1184 0.034 3.501 0.001 0.051 0.185
three 0.1465 0.063 2.323 0.022 0.022 0.271
Highend 0.1697 0.029 5.933 0.000 0.113 0.226
bmw 0.1216 0.034 3.596 0.000 0.055 0.188
rear 0.3758 0.067 5.595 0.000 0.243 0.509
==============================================================================
Omnibus: 37.099 Durbin-Watson: 1.859
Prob(Omnibus): 0.000 Jarque-Bera (JB): 89.139
Skew: 1.053 Prob(JB): 4.40e-20
Kurtosis: 6.244 Cond. No. 26.6
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
X_train_new1 = sm.add_constant(X_train_new)  # add intercept column to match the fitted design
X_train_new1
y_train_price = model5.predict(X_train_new1)  # in-sample (training) predictions
# Plot the histogram of the error terms — should be roughly normal, centred at 0.
fig = plt.figure()
sns.distplot((y_train - y_train_price), bins = 20)
fig.suptitle('Error Terms', fontsize = 20) # Plot heading
plt.xlabel('Errors', fontsize = 18)
df_test
# Scaling the test set: apply the scaler FIT ON TRAIN via transform() only.
# The original called scaler.fit_transform on the test set — refitting on
# test data leaks its min/max into the evaluation — and scaled a different
# column list than the one used in training.
train_scaled_vars = ['wheelbase', 'carheight', 'stroke', 'curbweight', 'enginesize', 'boreratio',
                     'horsepower', 'fueleconomy', 'carlength', 'carwidth', 'price']
df_test1 = df_test.copy()
df_test1[train_scaled_vars] = scaler.transform(df_test1[train_scaled_vars])
df_test1
# Dividing into X and y, keeping only model5's predictors.
y_test = df_test1.pop('price')
X_test = df_test1[['carwidth', 'curbweight', 'enginesize', 'boreratio', 'rotor',
                   'three', 'Highend', 'bmw', 'rear']]
# Now let's use our model to make predictions.
X_test_new = pd.DataFrame(sm.add_constant(X_test))
X_test_new
# Making predictions
y_pred = model5.predict(X_test_new)
from sklearn.metrics import r2_score
print("Test Prediction R-Sqrd: ", r2_score(y_test, y_pred))
Test Prediction R-Sqrd: 0.8606328550795908
# In-sample fit of the final model. The original label said "Test" but
# this line scores the TRAINING predictions (y_train vs y_train_price).
print("Train Prediction R-Sqrd: ", r2_score(y_train, y_train_price))
Test Prediction R-Sqrd: 0.9253685182104617
# Plot the histogram of the error terms for the TEST set predictions.
fig = plt.figure()
sns.distplot((y_test - y_pred), bins = 30)
fig.suptitle('Error Terms', fontsize = 20) # Plot heading
plt.xlabel('Errors', fontsize = 18)
#EVALUATION OF THE MODEL
# Plotting y_test and y_pred to understand the spread — points hugging a
# straight diagonal indicate good agreement between actual and predicted.
fig = plt.figure()
plt.scatter(y_test,y_pred)
fig.suptitle('y_test vs y_pred', fontsize=20) # Plot heading
plt.xlabel('y_test', fontsize=18) # X-label
plt.ylabel('y_pred', fontsize=16)
## Residual Normality
residt = model5.resid  # training residuals of the final model
residt
probplot = sm.ProbPlot(residt)
plt.figure(figsize=(8,6))
probplot.ppplot(line='45')  # points on the 45-degree line => residuals ~ normal
plt.title('Normal P-P Plot for Regression Standardised Residuals')
plt.show()
## Test of Homoscedasticity
def get_standard_values(parm):
    """Standardise `parm` to z-scores: zero mean, unit (sample) std."""
    centered = parm - parm.mean()
    return centered / parm.std()
# Standardised residuals vs standardised fitted values — a patternless
# cloud around 0 suggests homoscedastic (constant-variance) errors.
plt.scatter(get_standard_values(model5.fittedvalues), get_standard_values(residt))
plt.title('Residual Analysis - Noises')
plt.xlabel('Standarddised Predicted Values')
plt.ylabel('Standarddised Residuals')
print(model5.summary())
OLS Regression Results
==============================================================================
Dep. Variable: price R-squared: 0.925
Model: OLS Adj. R-squared: 0.920
Method: Least Squares F-statistic: 183.2
Date: Thu, 16 Sep 2021 Prob (F-statistic): 1.99e-70
Time: 03:47:32 Log-Likelihood: 202.50
No. Observations: 143 AIC: -385.0
Df Residuals: 133 BIC: -355.4
Df Model: 9
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const -0.1323 0.018 -7.435 0.000 -0.167 -0.097
carwidth 0.3217 0.060 5.351 0.000 0.203 0.441
curbweight 0.3085 0.069 4.466 0.000 0.172 0.445
enginesize 0.3372 0.084 4.021 0.000 0.171 0.503
boreratio -0.0730 0.032 -2.263 0.025 -0.137 -0.009
rotor 0.1184 0.034 3.501 0.001 0.051 0.185
three 0.1465 0.063 2.323 0.022 0.022 0.271
Highend 0.1697 0.029 5.933 0.000 0.113 0.226
bmw 0.1216 0.034 3.596 0.000 0.055 0.188
rear 0.3758 0.067 5.595 0.000 0.243 0.509
==============================================================================
Omnibus: 37.099 Durbin-Watson: 1.859
Prob(Omnibus): 0.000 Jarque-Bera (JB): 89.139
Skew: 1.053 Prob(JB): 4.40e-20
Kurtosis: 6.244 Cond. No. 26.6
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.