# Silence library deprecation/user warnings so notebook output stays readable.
import warnings
warnings.filterwarnings('ignore')
#importing the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the car price dataset (205 rows x 26 columns per info() below).
# Assumes CarPrice_Assignment.csv sits in the working directory.
cars = pd.read_csv('CarPrice_Assignment.csv')
cars.head()
car_IDint64
symbolingint64
0
1
3
1
2
3
2
3
1
3
4
2
4
5
2
cars.shape  # (rows, columns) sanity check
cars.describe()  # summary statistics for the numeric columns
car_IDfloat64
symbolingfloat64
count
205
205
mean
103
0.8341463415
std
59.32256457
1.245306828
min
1
-2
25%
52
0
50%
103
1
75%
154
2
max
205
3
cars.info()  # dtypes and non-null counts: all 205 rows complete, no missing values
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 car_ID 205 non-null int64
1 symboling 205 non-null int64
2 CarName 205 non-null object
3 fueltype 205 non-null object
4 aspiration 205 non-null object
5 doornumber 205 non-null object
6 carbody 205 non-null object
7 drivewheel 205 non-null object
8 enginelocation 205 non-null object
9 wheelbase 205 non-null float64
10 carlength 205 non-null float64
11 carwidth 205 non-null float64
12 carheight 205 non-null float64
13 curbweight 205 non-null int64
14 enginetype 205 non-null object
15 cylindernumber 205 non-null object
16 enginesize 205 non-null int64
17 fuelsystem 205 non-null object
18 boreratio 205 non-null float64
19 stroke 205 non-null float64
20 compressionratio 205 non-null float64
21 horsepower 205 non-null int64
22 peakrpm 205 non-null int64
23 citympg 205 non-null int64
24 highwaympg 205 non-null int64
25 price 205 non-null float64
dtypes: float64(8), int64(8), object(10)
memory usage: 41.8+ KB
# Splitting company name from CarName column.
# CarName holds "company model"; keep only the leading company token.
CompanyName = cars['CarName'].apply(lambda x: x.split(' ')[0])
cars.insert(3, "CompanyName", CompanyName)
cars.drop(['CarName'], axis=1, inplace=True)
cars.head()
cars.CompanyName.unique()

# Normalise case before fixing the known misspellings in the raw data.
cars['CompanyName'] = cars['CompanyName'].str.lower()


def replace_name(a, b):
    """Replace company-name spelling `a` with `b` in the global `cars` frame.

    Assigns the result back to the column rather than using chained
    `cars.CompanyName.replace(..., inplace=True)`, which is deprecated in
    modern pandas and may silently operate on a copy.
    """
    cars['CompanyName'] = cars['CompanyName'].replace(a, b)


# Known misspellings observed in CompanyName.unique() above.
replace_name('alfa-romero', 'alfa-romeo')
replace_name('maxda', 'mazda')
replace_name('porcshce', 'porsche')
replace_name('toyouta', 'toyota')
replace_name('vokswagen', 'volkswagen')
replace_name('vw', 'volkswagen')
cars.CompanyName.unique()

# Checking for duplicates (full-row duplicate scan; none expected).
cars.loc[cars.duplicated()]
cars.columns

# Target variable EDA: distribution plus spread of price.
plt.figure(figsize=(20, 8))
plt.subplot(1, 2, 1)
plt.title('Car Price Distribution Plot')
sns.distplot(cars.price)  # NOTE: distplot is deprecated in seaborn>=0.11 (histplot is the successor)
plt.subplot(1, 2, 2)
plt.title('Car Price Spread')
sns.boxplot(y=cars.price)
plt.show()
print(cars.price.describe(percentiles=[0.25, 0.50, 0.75, 0.85, 0.90, 1]))
count 205.000000
mean 13276.710571
std 7988.852332
min 5118.000000
25% 7788.000000
50% 10295.000000
75% 16503.000000
85% 18500.000000
90% 22563.000000
100% 45400.000000
max 45400.000000
Name: price, dtype: float64
# Frequency bar charts: company, fuel type and body style, side by side.
plt.figure(figsize=(25, 6))
bar_specs = [
    ('CompanyName', 'Companies Histogram', 'Car company', 'Frequency of company'),
    ('fueltype', 'Fuel Type Histogram', 'Fuel Type', 'Frequency of fuel type'),
    ('carbody', 'Car Type Histogram', 'Car Type', 'Frequency of Car type'),
]
for position, (column, title, xlab, ylab) in enumerate(bar_specs, start=1):
    plt.subplot(1, 3, position)
    plt1 = cars[column].value_counts().plot(kind='bar')
    plt.title(title)
    plt1.set(xlabel=xlab, ylabel=ylab)
plt.show()
# Symboling (insurance risk rating): frequency and its relation to price.
plt.figure(figsize=(20,8))
plt.subplot(1,2,1)
plt.title('Symboling Histogram')
sns.countplot(x=cars.symboling)  # keyword arg: positional data was removed in seaborn>=0.12
plt.subplot(1,2,2)
plt.title('Symboling vs Price')
sns.boxplot(x=cars.symboling, y=cars.price)
plt.show()
# Engine type: frequency, price spread, and mean price per type.
plt.figure(figsize=(20,8))
plt.subplot(1,2,1)
plt.title('Engine Type Histogram')
sns.countplot(x=cars.enginetype)  # keyword arg: positional data was removed in seaborn>=0.12
plt.subplot(1,2,2)
plt.title('Engine Type vs Price')
sns.boxplot(x=cars.enginetype, y=cars.price)
# Average price per engine type, highest first.
df = pd.DataFrame(cars.groupby(['enginetype'])['price'].mean().sort_values(ascending = False))
df.plot.bar(figsize=(8,6))
plt.title('Engine Type vs Average Price')
plt.show()
# Mean price per category for company, fuel type and body style.
plt.figure(figsize=(25, 6))  # NOTE: df.plot.bar() opens its own figure, so this one stays empty
for column, title in [('CompanyName', 'Company Name vs Average Price'),
                      ('fueltype', 'Fuel Type vs Average Price'),
                      ('carbody', 'Car Type vs Average Price')]:
    df = pd.DataFrame(cars.groupby([column])['price'].mean().sort_values(ascending=False))
    df.plot.bar()
    plt.title(title)
plt.show()
# Door count: frequency and price spread.
plt.figure(figsize=(15,5))
plt.subplot(1,2,1)
plt.title('Door Number Histogram')
sns.countplot(x=cars.doornumber)  # keyword arg: positional data was removed in seaborn>=0.12
plt.subplot(1,2,2)
plt.title('Door Number vs Price')
sns.boxplot(x=cars.doornumber, y=cars.price)
plt.show()
# Aspiration (std vs turbo): frequency and price spread.
plt.figure(figsize=(15,5))
plt.subplot(1,2,1)
plt.title('Aspiration Histogram')
sns.countplot(x=cars.aspiration)  # keyword arg for the same seaborn compatibility reason
plt.subplot(1,2,2)
plt.title('Aspiration vs Price')
sns.boxplot(x=cars.aspiration, y=cars.price)
plt.show()
def plot_count(x, fig):
    """Draw a count plot and a price boxplot for categorical column `x`.

    Places them at positions `fig` and `fig`+1 of the current 4x2 subplot
    grid; reads the global `cars` frame.
    """
    plt.subplot(4, 2, fig)
    plt.title(x + ' Histogram')
    sns.countplot(x=cars[x])  # keyword arg: positional data was removed in seaborn>=0.12
    plt.subplot(4, 2, (fig + 1))
    plt.title(x + ' vs Price')
    sns.boxplot(x=cars[x], y=cars.price)


plt.figure(figsize=(15, 20))
plot_count('enginelocation', 1)
plot_count('cylindernumber', 3)
plot_count('fuelsystem', 5)
plot_count('drivewheel', 7)
plt.tight_layout()
def scatter(x, fig):
    """Scatter numeric column `x` against price at slot `fig` of a 5x2 grid."""
    plt.subplot(5, 2, fig)
    plt.scatter(cars[x], cars['price'])
    plt.title(x + ' vs Price')
    plt.ylabel('Price')
    plt.xlabel(x)


# Dimension/weight features vs price, one panel each.
plt.figure(figsize=(10, 20))
for position, column in enumerate(['carlength', 'carwidth', 'carheight', 'curbweight'], start=1):
    scatter(column, position)
plt.tight_layout()
# Derive New field - Fuel economy: 55/45 weighted blend of city and highway mpg.
cars['fueleconomy'] = (0.55 * cars['citympg']) + (0.45 * cars['highwaympg'])

# Binning the Car Companies based on avg prices of each Company.
cars['price'] = cars['price'].astype('int')
company_avg_price = cars.groupby(['CompanyName'])['price'].mean()
bins = [0, 10000, 20000, 40000]
cars_bin = ['Budget', 'Medium', 'Highend']
# map() keeps row alignment with `cars` directly, replacing the original
# copy + merge + 'price_y'-suffix round trip with the same result.
cars['carsrange'] = pd.cut(cars['CompanyName'].map(company_avg_price),
                           bins, right=False, labels=cars_bin)
cars.head()
# Fuel economy vs price, coloured by drivetrain.
plt.figure(figsize=(8,6))
plt.title('Fuel economy vs Price')
sns.scatterplot(x=cars['fueleconomy'],y=cars['price'], hue=cars['drivewheel'])
plt.xlabel('Fuel Economy')
plt.ylabel('Price')
plt.show()
plt.tight_layout()  # NOTE: no-op after show() — the figure is already rendered
# Candidate predictors chosen from the EDA above.
# NOTE(review): this 16-column subset is later overwritten by `cars_lr = cars`
# before dummy encoding — confirm which selection is intended.
cars_lr = cars[['price', 'fueltype', 'aspiration','carbody', 'drivewheel','wheelbase',
'curbweight', 'enginetype', 'cylindernumber', 'enginesize', 'boreratio','horsepower',
'fueleconomy', 'carlength','carwidth', 'carsrange']]
cars_lr.head()
sns.pairplot(cars_lr)  # pairwise scatter matrix of the candidate features
plt.show()
# Check the corr values of final list of variables
cor = cars.corr()  # NOTE: pandas>=2.0 would need numeric_only=True here
cor
# Collect every pair of numeric fields with |correlation| above 0.8.
correlated_features = set()
for row in range(len(cor.columns)):
    for col in range(row):  # lower triangle only, so each pair prints once
        strength = abs(cor.iloc[row, col])
        if strength > 0.8:
            first = cor.columns[row]
            second = cor.columns[col]
            print(strength, "--", row, '--', col, '--', first, '--', second)
            correlated_features.add(first)
            correlated_features.add(second)
0.874587475964264 -- 3 -- 2 -- carlength -- wheelbase
0.8411182684818453 -- 4 -- 3 -- carwidth -- carlength
0.8777284608306433 -- 6 -- 3 -- curbweight -- carlength
0.8670324646791233 -- 6 -- 4 -- curbweight -- carwidth
0.850594073426277 -- 7 -- 6 -- enginesize -- curbweight
0.8097686545377302 -- 11 -- 7 -- horsepower -- enginesize
0.8014561756662708 -- 13 -- 11 -- citympg -- horsepower
0.9713370423425045 -- 14 -- 13 -- highwaympg -- citympg
0.835305437204371 -- 15 -- 6 -- price -- curbweight
0.8741451426986426 -- 15 -- 7 -- price -- enginesize
0.8081381197889799 -- 15 -- 11 -- price -- horsepower
0.9938444201653043 -- 16 -- 13 -- fueleconomy -- citympg
0.9916921560568324 -- 16 -- 14 -- fueleconomy -- highwaympg
# Show which numeric fields entered the correlation matrix and which were flagged.
print(cor.columns)
print('------')
print(correlated_features)
Index(['car_ID', 'symboling', 'wheelbase', 'carlength', 'carwidth',
'carheight', 'curbweight', 'enginesize', 'boreratio', 'stroke',
'compressionratio', 'horsepower', 'peakrpm', 'citympg', 'highwaympg',
'price', 'fueleconomy'],
dtype='object')
------
{'fueleconomy', 'curbweight', 'carwidth', 'enginesize', 'wheelbase', 'highwaympg', 'price', 'citympg', 'carlength', 'horsepower'}
# Spot-check the strongest pair: highway vs city mpg (~0.97 above).
cor.loc['highwaympg', 'citympg']  # .loc instead of chained cor[a][b] indexing
# Print the Correlation values of the High Correlated fields.
# list(...) because set indexers are rejected by modern pandas __getitem__.
corh = cars[list(correlated_features)].corr()
corh
# Produce a Heatmap
plt.figure(figsize=(14,14))
sns.heatmap(corh, annot=True, linewidths=.5, fmt=".2f", cmap="YlGnBu")
# Enumerate each categorical column's levels to preview what get_dummies
# will create (same print order as the original cell).
for column in ['fueltype', 'aspiration', 'carbody', 'drivewheel', 'enginetype',
               'cylindernumber', 'carsrange', 'fuelsystem', 'CompanyName',
               'doornumber', 'enginelocation']:
    print(cars[column].unique())
['gas' 'diesel']
['std' 'turbo']
['convertible' 'hatchback' 'sedan' 'wagon' 'hardtop']
['rwd' 'fwd' '4wd']
['dohc' 'ohcv' 'ohc' 'l' 'rotor' 'ohcf' 'dohcv']
['four' 'six' 'five' 'three' 'twelve' 'two' 'eight']
['Medium', 'Highend', 'Budget']
Categories (3, object): ['Budget' < 'Medium' < 'Highend']
['mpfi' '2bbl' 'mfi' '1bbl' 'spfi' '4bbl' 'idi' 'spdi']
['alfa-romeo' 'audi' 'bmw' 'chevrolet' 'dodge' 'honda' 'isuzu' 'jaguar'
'mazda' 'buick' 'mercury' 'mitsubishi' 'nissan' 'peugeot' 'plymouth'
'porsche' 'renault' 'saab' 'subaru' 'toyota' 'volkswagen' 'volvo']
['two' 'four']
['front' 'rear']
# Defining the map function
def dummies(x, df, prefix=None):
    """One-hot encode column `x` of `df` and drop the original column.

    Uses drop_first=True, so k levels yield k-1 indicator columns.
    `prefix` is an optional, backward-compatible addition: without it,
    levels shared across columns (e.g. 'two' in both doornumber and
    cylindernumber) produce duplicate dummy column names in the result.

    Returns a new DataFrame; the caller's `df` is left unmodified.
    """
    temp = pd.get_dummies(df[x], drop_first=True, prefix=prefix)
    df = pd.concat([df, temp], axis=1)
    df.drop([x], axis=1, inplace=True)
    return df
# Applying the function to the cars_lr
# NOTE(review): this rebinds cars_lr to the FULL frame, discarding the
# 16-column subset selected earlier — confirm that is intended.
cars_lr = cars
cars_lr = dummies('fueltype',cars_lr)
cars_lr = dummies('aspiration',cars_lr)
cars_lr = dummies('carbody',cars_lr)
cars_lr = dummies('drivewheel',cars_lr)
cars_lr = dummies('enginetype',cars_lr)
# cylindernumber and doornumber both contain a level named 'two', so the
# encoded frame ends up with two identically-named 'two' columns
# (visible in the info() output below).
cars_lr = dummies('cylindernumber',cars_lr)
cars_lr = dummies('carsrange',cars_lr)
cars_lr = dummies('CompanyName',cars_lr)
cars_lr = dummies('doornumber',cars_lr)
cars_lr = dummies('enginelocation',cars_lr)
cars_lr = dummies('fuelsystem',cars_lr)
cars_lr.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 69 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 car_ID 205 non-null int64
1 symboling 205 non-null int64
2 wheelbase 205 non-null float64
3 carlength 205 non-null float64
4 carwidth 205 non-null float64
5 carheight 205 non-null float64
6 curbweight 205 non-null int64
7 enginesize 205 non-null int64
8 boreratio 205 non-null float64
9 stroke 205 non-null float64
10 compressionratio 205 non-null float64
11 horsepower 205 non-null int64
12 peakrpm 205 non-null int64
13 citympg 205 non-null int64
14 highwaympg 205 non-null int64
15 price 205 non-null int64
16 fueleconomy 205 non-null float64
17 gas 205 non-null uint8
18 turbo 205 non-null uint8
19 hardtop 205 non-null uint8
20 hatchback 205 non-null uint8
21 sedan 205 non-null uint8
22 wagon 205 non-null uint8
23 fwd 205 non-null uint8
24 rwd 205 non-null uint8
25 dohcv 205 non-null uint8
26 l 205 non-null uint8
27 ohc 205 non-null uint8
28 ohcf 205 non-null uint8
29 ohcv 205 non-null uint8
30 rotor 205 non-null uint8
31 five 205 non-null uint8
32 four 205 non-null uint8
33 six 205 non-null uint8
34 three 205 non-null uint8
35 twelve 205 non-null uint8
36 two 205 non-null uint8
37 Medium 205 non-null uint8
38 Highend 205 non-null uint8
39 audi 205 non-null uint8
40 bmw 205 non-null uint8
41 buick 205 non-null uint8
42 chevrolet 205 non-null uint8
43 dodge 205 non-null uint8
44 honda 205 non-null uint8
45 isuzu 205 non-null uint8
46 jaguar 205 non-null uint8
47 mazda 205 non-null uint8
48 mercury 205 non-null uint8
49 mitsubishi 205 non-null uint8
50 nissan 205 non-null uint8
51 peugeot 205 non-null uint8
52 plymouth 205 non-null uint8
53 porsche 205 non-null uint8
54 renault 205 non-null uint8
55 saab 205 non-null uint8
56 subaru 205 non-null uint8
57 toyota 205 non-null uint8
58 volkswagen 205 non-null uint8
59 volvo 205 non-null uint8
60 two 205 non-null uint8
61 rear 205 non-null uint8
62 2bbl 205 non-null uint8
63 4bbl 205 non-null uint8
64 idi 205 non-null uint8
65 mfi 205 non-null uint8
66 mpfi 205 non-null uint8
67 spdi 205 non-null uint8
68 spfi 205 non-null uint8
dtypes: float64(8), int64(9), uint8(52)
memory usage: 37.8 KB
cars_lr.shape
from sklearn.model_selection import train_test_split
np.random.seed(0)  # NOTE: redundant — train_test_split uses random_state=100 below
# 70/30 split; random_state fixes the partition for reproducibility.
df_train, df_test = train_test_split(cars_lr, train_size = 0.7, test_size = 0.3, random_state = 100)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# Min-max scale only the continuous fields; the dummy columns stay 0/1.
# The fitted scaler must be reused (transform only) on the test set later.
num_vars = ['wheelbase', 'carheight', 'stroke', 'curbweight', 'enginesize', 'boreratio', 'horsepower','fueleconomy','carlength','carwidth','price']
df_train[num_vars] = scaler.fit_transform(df_train[num_vars])
df_train.head()
df_train.describe()
#Dividing data into X and y variables: price is the regression target.
y_train = df_train.pop('price')
X_train = df_train
# !pip install statsmodels  — notebook shell escape; invalid syntax in a plain .py file (install statsmodels beforehand)
Collecting statsmodels
Downloading statsmodels-0.12.2-cp37-cp37m-manylinux1_x86_64.whl (9.5 MB)
|████████████████████████████████| 9.5 MB 27.5 MB/s
Requirement already satisfied: scipy>=1.1 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels) (1.7.1)
Requirement already satisfied: numpy>=1.15 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels) (1.19.5)
Requirement already satisfied: pandas>=0.21 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels) (1.2.5)
Collecting patsy>=0.5
Downloading patsy-0.5.1-py2.py3-none-any.whl (231 kB)
|████████████████████████████████| 231 kB 78.2 MB/s
Requirement already satisfied: pytz>=2017.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pandas>=0.21->statsmodels) (2021.1)
Requirement already satisfied: python-dateutil>=2.7.3 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pandas>=0.21->statsmodels) (2.8.2)
Requirement already satisfied: six in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from patsy>=0.5->statsmodels) (1.16.0)
Installing collected packages: patsy, statsmodels
Successfully installed patsy-0.5.1 statsmodels-0.12.2
WARNING: You are using pip version 21.2.3; however, version 21.2.4 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
def build_model(X, y):
    """Fit an OLS regression of y on X (with intercept) and print its summary.

    Returns the fitted statsmodels results object — a single value, so
    callers must not tuple-unpack the return.
    """
    design = sm.add_constant(X)       # prepend the intercept column
    fitted = sm.OLS(y, design).fit()  # ordinary least squares fit
    print(fitted.summary())
    return fitted
def checkVIF(X):
    """Return a DataFrame of variance inflation factors, highest VIF first."""
    factors = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    vif = pd.DataFrame({'Features': X.columns,
                        'VIF': [round(f, 2) for f in factors]})
    return vif.sort_values(by="VIF", ascending=False)
# Model 1: baseline OLS on every remaining feature (kitchen-sink fit).
model1 = build_model(X_train, y_train)
OLS Regression Results
==============================================================================
Dep. Variable: price R-squared: 0.975
Model: OLS Adj. R-squared: 0.959
Method: Least Squares F-statistic: 57.58
Date: Thu, 16 Sep 2021 Prob (F-statistic): 3.34e-49
Time: 03:34:46 Log-Likelihood: 282.04
No. Observations: 143 AIC: -446.1
Df Residuals: 84 BIC: -271.3
Df Model: 58
Covariance Type: nonrobust
====================================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------------
const -0.4193 0.205 -2.048 0.044 -0.826 -0.012
car_ID 0.0028 0.002 1.551 0.125 -0.001 0.006
symboling -0.0005 0.007 -0.072 0.943 -0.015 0.014
wheelbase 0.1787 0.100 1.792 0.077 -0.020 0.377
carlength -0.1617 0.100 -1.620 0.109 -0.360 0.037
carwidth 0.2345 0.101 2.319 0.023 0.033 0.436
carheight -0.1306 0.060 -2.175 0.032 -0.250 -0.011
curbweight 0.3391 0.130 2.616 0.011 0.081 0.597
enginesize 2.1169 0.465 4.550 0.000 1.192 3.042
boreratio -0.6585 0.170 -3.872 0.000 -0.997 -0.320
stroke -0.2263 0.088 -2.582 0.012 -0.401 -0.052
compressionratio -0.0245 0.017 -1.453 0.150 -0.058 0.009
horsepower -0.2414 0.209 -1.154 0.252 -0.657 0.175
peakrpm 8.104e-05 2.08e-05 3.895 0.000 3.97e-05 0.000
citympg -0.0038 0.004 -0.847 0.399 -0.013 0.005
highwaympg 0.0017 0.004 0.462 0.646 -0.006 0.009
fueleconomy 0.1702 0.083 2.048 0.044 0.005 0.335
gas -0.3598 0.099 -3.624 0.000 -0.557 -0.162
turbo 0.0811 0.029 2.801 0.006 0.024 0.139
hardtop -0.0890 0.057 -1.567 0.121 -0.202 0.024
hatchback -0.0893 0.045 -1.974 0.052 -0.179 0.001
sedan -0.0747 0.047 -1.575 0.119 -0.169 0.020
wagon -0.0617 0.050 -1.232 0.221 -0.161 0.038
fwd 0.0023 0.025 0.092 0.927 -0.047 0.051
rwd 0.0305 0.034 0.884 0.379 -0.038 0.099
dohcv 0.2926 0.170 1.717 0.090 -0.046 0.631
l 0.1209 0.098 1.229 0.222 -0.075 0.317
ohc 0.0064 0.044 0.144 0.886 -0.082 0.095
ohcf 0.0982 0.058 1.687 0.095 -0.018 0.214
ohcv -0.0183 0.039 -0.466 0.642 -0.097 0.060
rotor 0.4402 0.118 3.738 0.000 0.206 0.674
five 0.3051 0.142 2.142 0.035 0.022 0.588
four 0.4812 0.184 2.616 0.011 0.115 0.847
six 0.1399 0.103 1.357 0.178 -0.065 0.345
three 0.6072 0.143 4.257 0.000 0.324 0.891
twelve -0.3899 0.179 -2.182 0.032 -0.745 -0.035
two 0.4402 0.118 3.738 0.000 0.206 0.674
Medium 0.2023 0.138 1.461 0.148 -0.073 0.478
Highend 0.1491 0.039 3.839 0.000 0.072 0.226
audi 0.0657 0.085 0.772 0.442 -0.104 0.235
bmw 0.3726 0.098 3.800 0.000 0.178 0.568
buick -0.0577 0.054 -1.061 0.292 -0.166 0.050
chevrolet 0.0787 0.088 0.894 0.374 -0.096 0.254
dodge 0.0198 0.082 0.243 0.809 -0.142 0.182
honda 0.0261 0.074 0.354 0.724 -0.120 0.172
isuzu 0.0685 0.051 1.350 0.181 -0.032 0.169
jaguar -0.1552 0.101 -1.544 0.126 -0.355 0.045
mazda -0.1465 0.128 -1.142 0.257 -0.401 0.109
mercury 2.353e-15 5.71e-16 4.123 0.000 1.22e-15 3.49e-15
mitsubishi -0.1568 0.039 -3.996 0.000 -0.235 -0.079
nissan -0.2482 0.190 -1.309 0.194 -0.625 0.129
peugeot -0.4863 0.141 -3.444 0.001 -0.767 -0.205
plymouth -0.2559 0.106 -2.414 0.018 -0.467 -0.045
porsche -0.0106 0.130 -0.081 0.936 -0.270 0.249
renault -0.1552 0.111 -1.403 0.164 -0.375 0.065
saab -0.1479 0.256 -0.578 0.565 -0.657 0.361
subaru -0.1568 0.094 -1.671 0.098 -0.343 0.030
toyota -0.2389 0.174 -1.376 0.172 -0.584 0.106
volkswagen -0.5013 0.344 -1.458 0.149 -1.185 0.182
volvo -0.4205 0.352 -1.195 0.236 -1.120 0.279
two -0.0099 0.015 -0.656 0.514 -0.040 0.020
rear 0.2550 0.071 3.593 0.001 0.114 0.396
2bbl 0.0303 0.056 0.544 0.588 -0.080 0.141
4bbl -0.0436 0.079 -0.554 0.581 -0.200 0.113
idi -0.0595 0.193 -0.308 0.758 -0.443 0.324
mfi 0 0 nan nan 0 0
mpfi -0.0149 0.061 -0.245 0.807 -0.136 0.106
spdi -0.0103 0.066 -0.156 0.876 -0.141 0.121
spfi 0 0 nan nan 0 0
==============================================================================
Omnibus: 37.897 Durbin-Watson: 1.918
Prob(Omnibus): 0.000 Jarque-Bera (JB): 197.279
Skew: 0.763 Prob(JB): 1.45e-43
Kurtosis: 8.548 Cond. No. 1.01e+16
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 3.76e-23. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
X_train.columns
# Drop the high-p-value / collinear columns identified in model1's summary.
X_train1 = X_train.drop(
    ['car_ID', 'wheelbase', 'carlength', 'compressionratio', 'horsepower', 'citympg', 'highwaympg', 'hardtop',
     'sedan', 'wagon', 'fwd', 'rwd', 'dohcv', 'l', 'ohc', 'ohcf', 'ohcv', 'six', 'Medium',
     'audi', 'buick', 'chevrolet', 'dodge', 'honda', 'isuzu', 'jaguar', 'mazda', 'nissan', 'porsche', 'renault', 'saab',
     'subaru', 'toyota', 'volkswagen', 'volvo', 'two', '2bbl', '4bbl', 'idi', 'mfi', 'mpfi', 'spdi'], axis=1)
# build_model returns a single fitted-results object; the original
# `X_train_df, model2 = build_model(...)` tuple-unpack raised
# "TypeError: cannot unpack non-iterable RegressionResultsWrapper object".
model2 = build_model(X_train1, y_train)
OLS Regression Results
==============================================================================
Dep. Variable: price R-squared: 0.956
Model: OLS Adj. R-squared: 0.947
Method: Least Squares F-statistic: 111.3
Date: Thu, 16 Sep 2021 Prob (F-statistic): 2.01e-69
Time: 03:34:46 Log-Likelihood: 239.59
No. Observations: 143 AIC: -431.2
Df Residuals: 119 BIC: -360.1
Df Model: 23
Covariance Type: nonrobust
===============================================================================
coef std err t P>|t| [0.025 0.975]
-------------------------------------------------------------------------------
const -0.4296 0.103 -4.165 0.000 -0.634 -0.225
symboling 0.0046 0.005 0.947 0.346 -0.005 0.014
carwidth 0.3535 0.056 6.297 0.000 0.242 0.465
carheight -0.0522 0.034 -1.529 0.129 -0.120 0.015
curbweight 0.3176 0.094 3.380 0.001 0.132 0.504
enginesize 1.0516 0.196 5.355 0.000 0.663 1.440
boreratio -0.2586 0.052 -4.938 0.000 -0.362 -0.155
stroke -0.2291 0.049 -4.718 0.000 -0.325 -0.133
peakrpm 4.548e-05 1.23e-05 3.702 0.000 2.12e-05 6.98e-05
fueleconomy 0.0473 0.065 0.725 0.470 -0.082 0.176
gas -0.0267 0.027 -0.993 0.322 -0.080 0.027
turbo 0.0570 0.016 3.628 0.000 0.026 0.088
hatchback -0.0197 0.011 -1.773 0.079 -0.042 0.002
rotor 0.3480 0.075 4.669 0.000 0.200 0.496
five 0.0853 0.033 2.565 0.012 0.019 0.151
four 0.1651 0.040 4.077 0.000 0.085 0.245
three 0.3599 0.077 4.655 0.000 0.207 0.513
twelve -0.4233 0.098 -4.316 0.000 -0.617 -0.229
Highend 0.0894 0.029 3.134 0.002 0.033 0.146
bmw 0.2098 0.035 6.019 0.000 0.141 0.279
mercury -7.209e-16 1.8e-16 -4.010 0.000 -1.08e-15 -3.65e-16
mitsubishi -0.0832 0.020 -4.069 0.000 -0.124 -0.043
peugeot -0.1000 0.025 -3.953 0.000 -0.150 -0.050
plymouth -0.0396 0.027 -1.450 0.150 -0.094 0.014
rear 0.3707 0.063 5.910 0.000 0.247 0.495
spfi 0 0 nan nan 0 0
==============================================================================
Omnibus: 33.213 Durbin-Watson: 1.818
Prob(Omnibus): 0.000 Jarque-Bera (JB): 105.032
Skew: 0.826 Prob(JB): 1.56e-23
Kurtosis: 6.860 Cond. No. 1.01e+16
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 3.76e-23. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
Execution error
TypeError: cannot unpack non-iterable RegressionResultsWrapper object
X_train1.columns
checkVIF(X_train1)  # inspect multicollinearity among the survivors
# Drop the remaining high-VIF fields flagged by checkVIF and refit.
X_train2 = X_train1.drop(['enginesize', 'peakrpm', 'curbweight', 'four', 'boreratio', 'carwidth', 'stroke',
'gas', 'fueleconomy', 'carheight', 'rotor'], axis=1)
model3 = build_model(X_train2, y_train)
OLS Regression Results
==============================================================================
Dep. Variable: price R-squared: 0.724
Model: OLS Adj. R-squared: 0.699
Method: Least Squares F-statistic: 28.43
Date: Thu, 16 Sep 2021 Prob (F-statistic): 1.10e-30
Time: 03:46:04 Log-Likelihood: 109.02
No. Observations: 143 AIC: -192.0
Df Residuals: 130 BIC: -153.5
Df Model: 12
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 0.1479 0.016 9.528 0.000 0.117 0.179
symboling 0.0053 0.010 0.544 0.587 -0.014 0.025
turbo 0.1087 0.030 3.650 0.000 0.050 0.168
hatchback -0.0338 0.024 -1.397 0.165 -0.082 0.014
five 0.0703 0.045 1.562 0.121 -0.019 0.159
three -0.1238 0.120 -1.032 0.304 -0.361 0.114
twelve 0.2017 0.126 1.601 0.112 -0.047 0.451
Highend 0.5036 0.043 11.712 0.000 0.419 0.589
bmw -0.0530 0.065 -0.819 0.414 -0.181 0.075
mercury -1.357e-17 2.36e-17 -0.576 0.566 -6.02e-17 3.31e-17
mitsubishi -0.0876 0.045 -1.951 0.053 -0.176 0.001
peugeot 0.0721 0.047 1.530 0.128 -0.021 0.165
plymouth -0.1081 0.061 -1.783 0.077 -0.228 0.012
rear 0.1313 0.129 1.020 0.310 -0.123 0.386
spfi 0 0 nan nan 0 0
==============================================================================
Omnibus: 13.123 Durbin-Watson: 2.184
Prob(Omnibus): 0.001 Jarque-Bera (JB): 14.040
Skew: 0.680 Prob(JB): 0.000894
Kurtosis: 3.710 Cond. No. inf
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 0. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
# Recursive feature elimination: keep the 10 strongest predictors.
lm = LinearRegression()
lm.fit(X_train1, y_train)
# n_features_to_select must be a keyword: the positional form was removed in sklearn 1.2.
rfe = RFE(lm, n_features_to_select=10)
rfe = rfe.fit(X_train1, y_train)
# rfe was fitted on X_train1, so pair the support mask with X_train1's
# columns (the original zipped X_train.columns, silently misaligning names).
list(zip(X_train1.columns, rfe.support_, rfe.ranking_))
X_train1.columns[rfe.support_]
X_train_rfe = X_train[X_train1.columns[rfe.support_]]
X_train_rfe.head()
model4 = build_model(X_train_rfe, y_train)
OLS Regression Results
==============================================================================
Dep. Variable: price R-squared: 0.926
Model: OLS Adj. R-squared: 0.920
Method: Least Squares F-statistic: 164.6
Date: Thu, 16 Sep 2021 Prob (F-statistic): 1.94e-69
Time: 03:46:18 Log-Likelihood: 202.89
No. Observations: 143 AIC: -383.8
Df Residuals: 132 BIC: -351.2
Df Model: 10
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const -0.1329 0.018 -7.455 0.000 -0.168 -0.098
carwidth 0.3184 0.060 5.281 0.000 0.199 0.438
curbweight 0.2935 0.071 4.114 0.000 0.152 0.435
enginesize 0.3779 0.097 3.908 0.000 0.187 0.569
boreratio -0.0752 0.032 -2.320 0.022 -0.139 -0.011
rotor 0.1252 0.035 3.598 0.000 0.056 0.194
three 0.1475 0.063 2.336 0.021 0.023 0.272
twelve -0.0632 0.075 -0.848 0.398 -0.211 0.084
Highend 0.1690 0.029 5.902 0.000 0.112 0.226
bmw 0.1178 0.034 3.450 0.001 0.050 0.185
rear 0.3670 0.068 5.397 0.000 0.233 0.502
==============================================================================
Omnibus: 36.258 Durbin-Watson: 1.846
Prob(Omnibus): 0.000 Jarque-Bera (JB): 85.370
Skew: 1.037 Prob(JB): 2.90e-19
Kurtosis: 6.167 Cond. No. 31.2
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Drop 'twelve' (p = 0.398 in model4, the only insignificant term) and refit.
X_train_new = X_train_rfe.drop(["twelve"], axis = 1)
model5 = build_model(X_train_new,y_train)
OLS Regression Results
==============================================================================
Dep. Variable: price R-squared: 0.925
Model: OLS Adj. R-squared: 0.920
Method: Least Squares F-statistic: 183.2
Date: Thu, 16 Sep 2021 Prob (F-statistic): 1.99e-70
Time: 03:46:24 Log-Likelihood: 202.50
No. Observations: 143 AIC: -385.0
Df Residuals: 133 BIC: -355.4
Df Model: 9
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const -0.1323 0.018 -7.435 0.000 -0.167 -0.097
carwidth 0.3217 0.060 5.351 0.000 0.203 0.441
curbweight 0.3085 0.069 4.466 0.000 0.172 0.445
enginesize 0.3372 0.084 4.021 0.000 0.171 0.503
boreratio -0.0730 0.032 -2.263 0.025 -0.137 -0.009
rotor 0.1184 0.034 3.501 0.001 0.051 0.185
three 0.1465 0.063 2.323 0.022 0.022 0.271
Highend 0.1697 0.029 5.933 0.000 0.113 0.226
bmw 0.1216 0.034 3.596 0.000 0.055 0.188
rear 0.3758 0.067 5.595 0.000 0.243 0.509
==============================================================================
Omnibus: 37.099 Durbin-Watson: 1.859
Prob(Omnibus): 0.000 Jarque-Bera (JB): 89.139
Skew: 1.053 Prob(JB): 4.40e-20
Kurtosis: 6.244 Cond. No. 26.6
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
X_train_new1 = sm.add_constant(X_train_new)  # add intercept column to match the fitted design
X_train_new1
y_train_price = model5.predict(X_train_new1)  # in-sample (training) predictions
# Plot the histogram of the error terms — should be roughly normal, centred at 0.
fig = plt.figure()
sns.distplot((y_train - y_train_price), bins = 20)
fig.suptitle('Error Terms', fontsize = 20) # Plot heading
plt.xlabel('Errors', fontsize = 18)
df_test
# Scaling the test set: apply the scaler FIT ON TRAIN via transform() only.
# The original called scaler.fit_transform on the test set — refitting on
# test data leaks its min/max into the evaluation — and scaled a different
# column list than the one used in training.
train_scaled_vars = ['wheelbase', 'carheight', 'stroke', 'curbweight', 'enginesize', 'boreratio',
                     'horsepower', 'fueleconomy', 'carlength', 'carwidth', 'price']
df_test1 = df_test.copy()
df_test1[train_scaled_vars] = scaler.transform(df_test1[train_scaled_vars])
df_test1
# Dividing into X and y, keeping only model5's predictors.
y_test = df_test1.pop('price')
X_test = df_test1[['carwidth', 'curbweight', 'enginesize', 'boreratio', 'rotor',
                   'three', 'Highend', 'bmw', 'rear']]
# Now let's use our model to make predictions.
X_test_new = pd.DataFrame(sm.add_constant(X_test))
X_test_new
# Making predictions
y_pred = model5.predict(X_test_new)
from sklearn.metrics import r2_score
print("Test Prediction R-Sqrd: ", r2_score(y_test, y_pred))
Test Prediction R-Sqrd: 0.8606328550795908
# In-sample fit of the final model. The original label said "Test" but
# this line scores the TRAINING predictions (y_train vs y_train_price).
print("Train Prediction R-Sqrd: ", r2_score(y_train, y_train_price))
Test Prediction R-Sqrd: 0.9253685182104617
# Plot the histogram of the error terms for the TEST set predictions.
fig = plt.figure()
sns.distplot((y_test - y_pred), bins = 30)
fig.suptitle('Error Terms', fontsize = 20) # Plot heading
plt.xlabel('Errors', fontsize = 18)
#EVALUATION OF THE MODEL
# Plotting y_test and y_pred to understand the spread — points hugging a
# straight diagonal indicate good agreement between actual and predicted.
fig = plt.figure()
plt.scatter(y_test,y_pred)
fig.suptitle('y_test vs y_pred', fontsize=20) # Plot heading
plt.xlabel('y_test', fontsize=18) # X-label
plt.ylabel('y_pred', fontsize=16)
## Residual Normality
residt = model5.resid  # training residuals of the final model
residt
probplot = sm.ProbPlot(residt)
plt.figure(figsize=(8,6))
probplot.ppplot(line='45')  # points on the 45-degree line => residuals ~ normal
plt.title('Normal P-P Plot for Regression Standardised Residuals')
plt.show()
## Test of Homoscedasticity
def get_standard_values(parm):
    """Standardise `parm` to z-scores: zero mean, unit (sample) std."""
    centered = parm - parm.mean()
    return centered / parm.std()
# Standardised residuals vs standardised fitted values — a patternless
# cloud around 0 suggests homoscedastic (constant-variance) errors.
plt.scatter(get_standard_values(model5.fittedvalues), get_standard_values(residt))
plt.title('Residual Analysis - Noises')
plt.xlabel('Standarddised Predicted Values')
plt.ylabel('Standarddised Residuals')
print(model5.summary())
OLS Regression Results
==============================================================================
Dep. Variable: price R-squared: 0.925
Model: OLS Adj. R-squared: 0.920
Method: Least Squares F-statistic: 183.2
Date: Thu, 16 Sep 2021 Prob (F-statistic): 1.99e-70
Time: 03:47:32 Log-Likelihood: 202.50
No. Observations: 143 AIC: -385.0
Df Residuals: 133 BIC: -355.4
Df Model: 9
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const -0.1323 0.018 -7.435 0.000 -0.167 -0.097
carwidth 0.3217 0.060 5.351 0.000 0.203 0.441
curbweight 0.3085 0.069 4.466 0.000 0.172 0.445
enginesize 0.3372 0.084 4.021 0.000 0.171 0.503
boreratio -0.0730 0.032 -2.263 0.025 -0.137 -0.009
rotor 0.1184 0.034 3.501 0.001 0.051 0.185
three 0.1465 0.063 2.323 0.022 0.022 0.271
Highend 0.1697 0.029 5.933 0.000 0.113 0.226
bmw 0.1216 0.034 3.596 0.000 0.055 0.188
rear 0.3758 0.067 5.595 0.000 0.243 0.509
==============================================================================
Omnibus: 37.099 Durbin-Watson: 1.859
Prob(Omnibus): 0.000 Jarque-Bera (JB): 89.139
Skew: 1.053 Prob(JB): 4.40e-20
Kurtosis: 6.244 Cond. No. 26.6
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.