import warnings
warnings.filterwarnings('ignore')
# Importing the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
cars = pd.read_csv('CarPrice_Assignment.csv')
cars.head()
cars.shape
cars.describe()
cars.info()
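# Optional sanity check: confirm there are no missing values before cleaning further
cars.isnull().sum()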
# Splitting company name from CarName column
CompanyName = cars['CarName'].apply(lambda x : x.split(' ')[0])
cars.insert(3,"CompanyName",CompanyName)
cars.drop(['CarName'],axis=1,inplace=True)
cars.head()
cars.CompanyName.unique()
cars.CompanyName = cars.CompanyName.str.lower()
def replace_name(a, b):
    cars.CompanyName.replace(a, b, inplace=True)
replace_name('alfa-romero','alfa-romeo')
replace_name('maxda','mazda')
replace_name('porcshce','porsche')
replace_name('toyouta','toyota')
replace_name('vokswagen','volkswagen')
replace_name('vw','volkswagen')
cars.CompanyName.unique()
#Checking for duplicates
cars.loc[cars.duplicated()]
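# Optional: total count of fully duplicated rows
cars.duplicated().sum()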
cars.columns
plt.figure(figsize=(20,8))
plt.subplot(1,2,1)
plt.title('Car Price Distribution Plot')
sns.histplot(cars.price, kde=True)
plt.subplot(1,2,2)
plt.title('Car Price Spread')
sns.boxplot(y=cars.price)
plt.show()
print(cars.price.describe(percentiles = [0.25,0.50,0.75,0.85,0.90,1]))
plt.figure(figsize=(25, 6))
plt.subplot(1,3,1)
plt1 = cars.CompanyName.value_counts().plot(kind='bar')
plt.title('Companies Histogram')
plt1.set(xlabel = 'Car company', ylabel='Frequency of company')
plt.subplot(1,3,2)
plt1 = cars.fueltype.value_counts().plot(kind='bar')
plt.title('Fuel Type Histogram')
plt1.set(xlabel = 'Fuel Type', ylabel='Frequency of fuel type')
plt.subplot(1,3,3)
plt1 = cars.carbody.value_counts().plot(kind='bar')
plt.title('Car Type Histogram')
plt1.set(xlabel = 'Car Type', ylabel='Frequency of Car type')
plt.show()
plt.figure(figsize=(20,8))
plt.subplot(1,2,1)
plt.title('Symboling Histogram')
sns.countplot(x=cars.symboling)
plt.subplot(1,2,2)
plt.title('Symboling vs Price')
sns.boxplot(x=cars.symboling, y=cars.price)
plt.show()
plt.figure(figsize=(20,8))
plt.subplot(1,2,1)
plt.title('Engine Type Histogram')
sns.countplot(x=cars.enginetype)
plt.subplot(1,2,2)
plt.title('Engine Type vs Price')
sns.boxplot(x=cars.enginetype, y=cars.price)
df = pd.DataFrame(cars.groupby(['enginetype'])['price'].mean().sort_values(ascending = False))
df.plot.bar(figsize=(8,6))
plt.title('Engine Type vs Average Price')
plt.show()
plt.figure(figsize=(25, 6))
df = pd.DataFrame(cars.groupby(['CompanyName'])['price'].mean().sort_values(ascending = False))
df.plot.bar()
plt.title('Company Name vs Average Price')
df = pd.DataFrame(cars.groupby(['fueltype'])['price'].mean().sort_values(ascending = False))
df.plot.bar()
plt.title('Fuel Type vs Average Price')
df = pd.DataFrame(cars.groupby(['carbody'])['price'].mean().sort_values(ascending = False))
df.plot.bar()
plt.title('Car Type vs Average Price')
plt.show()
plt.figure(figsize=(15,5))
plt.subplot(1,2,1)
plt.title('Door Number Histogram')
sns.countplot(x=cars.doornumber)
plt.subplot(1,2,2)
plt.title('Door Number vs Price')
sns.boxplot(x=cars.doornumber, y=cars.price)
plt.show()
plt.figure(figsize=(15,5))
plt.subplot(1,2,1)
plt.title('Aspiration Histogram')
sns.countplot(x=cars.aspiration)
plt.subplot(1,2,2)
plt.title('Aspiration vs Price')
sns.boxplot(x=cars.aspiration, y=cars.price)
plt.show()
def plot_count(x, fig):
    plt.subplot(4, 2, fig)
    plt.title(x + ' Histogram')
    sns.countplot(x=cars[x])
    plt.subplot(4, 2, fig + 1)
    plt.title(x + ' vs Price')
    sns.boxplot(x=cars[x], y=cars.price)
plt.figure(figsize=(15,20))
plot_count('enginelocation', 1)
plot_count('cylindernumber', 3)
plot_count('fuelsystem', 5)
plot_count('drivewheel', 7)
plt.tight_layout()
def scatter(x, fig):
    plt.subplot(5, 2, fig)
    plt.scatter(cars[x], cars['price'])
    plt.title(x + ' vs Price')
    plt.ylabel('Price')
    plt.xlabel(x)
plt.figure(figsize=(10,20))
scatter('carlength', 1)
scatter('carwidth', 2)
scatter('carheight', 3)
scatter('curbweight', 4)
plt.tight_layout()
# Derive New field - Fuel economy
cars['fueleconomy'] = (0.55 * cars['citympg']) + (0.45 * cars['highwaympg'])
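# Optional sanity check on the derived field: the weighted average should sit between citympg and highwaympg
cars['fueleconomy'].describe()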
# Binning the Car Companies based on avg prices of each Company.
cars['price'] = cars['price'].astype('int')
temp = cars.copy()
table = temp.groupby(['CompanyName'])['price'].mean()
temp = temp.merge(table.reset_index(), how='left', on='CompanyName')  # adds the company average price as 'price_y'
bins = [0,10000,20000,40000]
cars_bin = ['Budget','Medium','Highend']
cars['carsrange'] = pd.cut(temp['price_y'], bins, right=False, labels=cars_bin)
cars.head()
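# Optional check: how the cars fall into the three derived price bins
cars['carsrange'].value_counts()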
plt.figure(figsize=(8,6))
plt.title('Fuel economy vs Price')
sns.scatterplot(x=cars['fueleconomy'],y=cars['price'], hue=cars['drivewheel'])
plt.xlabel('Fuel Economy')
plt.ylabel('Price')
plt.tight_layout()
plt.show()
cars_lr = cars[['price', 'fueltype', 'aspiration','carbody', 'drivewheel','wheelbase',
'curbweight', 'enginetype', 'cylindernumber', 'enginesize', 'boreratio','horsepower',
'fueleconomy', 'carlength','carwidth', 'carsrange']]
cars_lr.head()
sns.pairplot(cars_lr)
plt.show()
# Check the correlation values of the numeric variables
cor = cars.corr(numeric_only=True)
cor
# Find out the Fields with high correlation
correlated_features = set()
for i in range(len(cor.columns)):
    for j in range(i):
        if abs(cor.iloc[i, j]) > 0.8:
            colname1 = cor.columns[i]
            colname2 = cor.columns[j]
            print(abs(cor.iloc[i, j]), "--", i, '--', j, '--', colname1, '--', colname2)
            correlated_features.add(colname1)
            correlated_features.add(colname2)
print(cor.columns)
print('------')
print(correlated_features)
cor['highwaympg']['citympg']
# Print the Correlation values of the High Correlated fields
corh = cars[list(correlated_features)].corr()
corh
# Produce a Heatmap
plt.figure(figsize=(14,14))
sns.heatmap(corh, annot=True, linewidths=.5, fmt=".2f", cmap="YlGnBu")
print(cars['fueltype'].unique())
print(cars['aspiration'].unique())
print(cars['carbody'].unique())
print(cars['drivewheel'].unique())
print(cars['enginetype'].unique())
print(cars['cylindernumber'].unique())
print(cars['carsrange'].unique())
print(cars['fuelsystem'].unique())
print(cars['CompanyName'].unique())
print(cars['doornumber'].unique())
print(cars['enginelocation'].unique())
# Defining the map function
def dummies(x, df):
    temp = pd.get_dummies(df[x], drop_first=True)
    df = pd.concat([df, temp], axis=1)
    df.drop([x], axis=1, inplace=True)
    return df
# Applying the function to the full cars frame (this resets cars_lr from the earlier subset)
cars_lr = cars
cars_lr = dummies('fueltype',cars_lr)
cars_lr = dummies('aspiration',cars_lr)
cars_lr = dummies('carbody',cars_lr)
cars_lr = dummies('drivewheel',cars_lr)
cars_lr = dummies('enginetype',cars_lr)
cars_lr = dummies('cylindernumber',cars_lr)
cars_lr = dummies('carsrange',cars_lr)
cars_lr = dummies('CompanyName',cars_lr)
cars_lr = dummies('doornumber',cars_lr)
cars_lr = dummies('enginelocation',cars_lr)
cars_lr = dummies('fuelsystem',cars_lr)
cars_lr.info()
cars_lr.shape
from sklearn.model_selection import train_test_split
np.random.seed(0)
df_train, df_test = train_test_split(cars_lr, train_size = 0.7, test_size = 0.3, random_state = 100)
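# Quick check of the resulting split sizes (roughly 70/30)
print(df_train.shape, df_test.shape)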
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
num_vars = ['wheelbase', 'carheight', 'stroke', 'curbweight', 'enginesize', 'boreratio', 'horsepower','fueleconomy','carlength','carwidth','price']
df_train[num_vars] = scaler.fit_transform(df_train[num_vars])
df_train.head()
df_train.describe()
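# Optional check: the MinMax-scaled columns should now lie within [0, 1]
df_train[num_vars].agg(['min', 'max'])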
#Dividing data into X and y variables
y_train = df_train.pop('price')
X_train = df_train
!pip install statsmodels
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
def build_model(X, y):
    X = sm.add_constant(X)   # add the intercept (constant) term
    lm = sm.OLS(y, X).fit()  # fit the OLS model
    print(lm.summary())      # print the model summary
    return lm
def checkVIF(X):
    vif = pd.DataFrame()
    vif['Features'] = X.columns
    vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    vif['VIF'] = round(vif['VIF'], 2)
    vif = vif.sort_values(by="VIF", ascending=False)
    return vif
model1 = build_model(X_train, y_train)
X_train.columns
X_train1 = X_train.drop(
['car_ID', 'wheelbase', 'carlength', 'compressionratio', 'horsepower', 'citympg', 'highwaympg', 'hardtop',
'sedan','wagon','fwd','rwd','dohcv','l','ohc','ohcf','ohcv','six','Medium',
'audi','buick','chevrolet','dodge','honda','isuzu','jaguar','mazda','nissan','porsche','renault','saab',
'subaru','toyota','volkswagen','volvo','two','2bbl','4bbl','idi','mfi','mpfi','spdi'], axis=1)
model2 = build_model(X_train1, y_train)
X_train1.columns
checkVIF(X_train1)
X_train2 = X_train1.drop(['enginesize', 'peakrpm', 'curbweight', 'four', 'boreratio', 'carwidth', 'stroke',
'gas', 'fueleconomy', 'carheight', 'rotor'], axis=1)
model3 = build_model(X_train2, y_train)
lm = LinearRegression()
lm.fit(X_train1,y_train)
rfe = RFE(lm, n_features_to_select=10)
rfe = rfe.fit(X_train1, y_train)
list(zip(X_train1.columns, rfe.support_, rfe.ranking_))
X_train1.columns[rfe.support_]
X_train_rfe = X_train[X_train1.columns[rfe.support_]]
X_train_rfe.head()
model4 = build_model(X_train_rfe,y_train)
X_train_new = X_train_rfe.drop(["twelve"], axis = 1)
model5 = build_model(X_train_new,y_train)
X_train_new1 = sm.add_constant(X_train_new)
X_train_new1
y_train_price = model5.predict(X_train_new1)
# Plot the histogram of the error terms
fig = plt.figure()
sns.histplot((y_train - y_train_price), bins=20, kde=True)
fig.suptitle('Error Terms', fontsize = 20) # Plot heading
plt.xlabel('Errors', fontsize = 18)
df_test
# Scaling the test set (note: the scaler is refit here on the model's final feature columns)
num_vars = ['carwidth', 'curbweight', 'enginesize', 'boreratio', 'rotor', 'three', 'Highend', 'bmw', 'rear', 'price']
df_test1 = pd.DataFrame(scaler.fit_transform(df_test[num_vars]), columns=num_vars)
df_test1
#Dividing into X and y
y_test = df_test1.pop('price')
X_test = df_test1
# Now let's use our model to make predictions.
X_test_new = pd.DataFrame(sm.add_constant(X_test))
X_test_new
# Making predictions
y_pred = model5.predict(X_test_new)
from sklearn.metrics import r2_score
print("Test Prediction R-Sqrd: ", r2_score(y_test, y_pred))
print("Test Prediction R-Sqrd: ", r2_score(y_train, y_train_price))
# Plot the histogram of the error terms
fig = plt.figure()
sns.histplot((y_test - y_pred), bins=30, kde=True)
fig.suptitle('Error Terms', fontsize = 20) # Plot heading
plt.xlabel('Errors', fontsize = 18)
#EVALUATION OF THE MODEL
# Plotting y_test and y_pred to understand the spread.
fig = plt.figure()
plt.scatter(y_test,y_pred)
fig.suptitle('y_test vs y_pred', fontsize=20) # Plot heading
plt.xlabel('y_test', fontsize=18) # X-label
plt.ylabel('y_pred', fontsize=16)
## Residual Normality
residt = model5.resid
residt
probplot = sm.ProbPlot(residt)
fig, ax = plt.subplots(figsize=(8,6))
probplot.ppplot(line='45', ax=ax)
plt.title('Normal P-P Plot for Regression Standardised Residuals')
plt.show()
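# Optional numeric complement to the P-P plot: Jarque-Bera test on the residuals
from statsmodels.stats.stattools import jarque_bera
jb_stat, jb_pvalue, jb_skew, jb_kurtosis = jarque_bera(residt)
print('Jarque-Bera statistic:', round(jb_stat, 3), ' p-value:', round(jb_pvalue, 4))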
## Test of Homoscedasticity
def get_standard_values(parm):
    return (parm - parm.mean()) / parm.std()
plt.scatter(get_standard_values(model5.fittedvalues), get_standard_values(residt))
plt.title('Residual Analysis - Noise')
plt.xlabel('Standardised Predicted Values')
plt.ylabel('Standardised Residuals')
plt.show()
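# Optional numeric complement to the residual scatter: Breusch-Pagan test for heteroscedasticity
# (model5.model.exog already contains the constant added inside build_model)
from statsmodels.stats.diagnostic import het_breuschpagan
bp_stat, bp_pvalue, bp_f, bp_f_pvalue = het_breuschpagan(residt, model5.model.exog)
print('Breusch-Pagan LM statistic:', round(bp_stat, 3), ' p-value:', round(bp_pvalue, 4))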
print(model5.summary())