import numpy as np # Multi-dimensional array object
import pandas as pd # Data Manipulation
import seaborn as sns # Data Visualization
import matplotlib.pyplot as plt # Data Visualization
import plotly.express as px # Interactive Data Visualization
# from jupyterthemes import jtplot # Jupyter Notebook Theme
# jtplot.style(theme = 'monokai', context = 'notebook', ticks = True, grid = False)
# from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot # Offline version of the Plotly modules.
def _currency_to_int(prices):
    """Return a currency-formatted string Series as integers.

    The '$' and ',' characters are stripped as literal text
    (``regex=False``) so that '$' can never be interpreted as the regex
    end-of-string anchor, whatever the installed pandas version uses as
    its default for ``regex``.
    """
    digits = prices.str.replace('$', '', regex=False)
    digits = digits.str.replace(',', '', regex=False)
    return digits.astype(int)


# Load the raw data set and take a first look at it.
df = pd.read_csv('./cars_data.csv')
df.head()
df.columns
df.shape
df.info()

# Discard every row that contains at least one missing value.
df = df.dropna()

# MSRP and Invoice are read as text containing '$' and ','. Both are
# converted to integers; previously Invoice was stripped of those characters
# but left as strings, although it is used as a numeric column later on.
df['MSRP'] = _currency_to_int(df['MSRP'])
df['Invoice'] = _currency_to_int(df['Invoice'])

# Re-inspect the cleaned frame and confirm no missing values remain.
df.head()
df.info()
df.isna().sum()
# Pairwise relationship plots for the columns of the DataFrame.
sns.pairplot(df)

# Histogram of the "Type" column.
# The axis label used to read "Manufacturer", which describes the "Make"
# column rather than "Type"; it now names the column actually plotted.
fig = px.histogram(
    df,
    x="Type",
    labels={"Type": "Type"},
    title="Car Type",
    color_discrete_sequence=["blue"],
)
fig.show()
# Distinct values found in the "Origin" column.
df["Origin"].unique()

# Histogram of the "Origin" column, drawn in red.
fig = px.histogram(
    data_frame=df,
    x='Origin',
    title="Location Of Car Sales",
    labels={"Origin": "Origin"},
    color_discrete_sequence=["red"],
)
fig.show()
# Distinct values found in the "DriveTrain" column.
df["DriveTrain"].unique()

# Histogram of the "DriveTrain" column, drawn in black.
fig = px.histogram(
    data_frame=df,
    x="DriveTrain",
    title="Drivetrain of the car",
    labels={"DriveTrain": "Drivetrain"},
    color_discrete_sequence=["BLACK"],
)
fig.show()
# Histogram of the "Make" column, with bars coloured by "Origin".
fig = px.histogram(
    data_frame=df,
    x="Make",
    color="Origin",
    title="location vs make of the car",
    labels={"Make": "Manufacturer"},
)
fig.show()
# Select the columns whose pairwise correlations are to be inspected.
numeric_columns = df.loc[:, [
    'MSRP',
    'Invoice',
    'EngineSize',
    'Cylinders',
    'Horsepower',
    'MPG_City',
    'MPG_Highway',
    'Weight',
    'Wheelbase',
    'Length',
]]
correlation_matrix = numeric_columns.corr()

# Draw the correlation matrix as an annotated heatmap on a large canvas.
plt.figure(figsize=(18, 18))
sns.heatmap(data=correlation_matrix, annot=True, cmap="YlGnBu")
df.head()

# One-hot encode the categorical columns.
df_dummy = pd.get_dummies(df, columns=["Make", "Model", "Type", "Origin", "DriveTrain"])
df_dummy.head()

# Invoice is removed from the modelling data.
df_data = df_dummy.drop(["Invoice"], axis=1)
df_data.head()

# Features to X and Output (MSRP) to y
X = df_data.drop("MSRP", axis=1)
y = df_data["MSRP"]

# Cast the feature matrix explicitly to float. Recent pandas versions emit
# boolean dummy columns, and converting a frame that mixes bool and numeric
# columns without a dtype yields an object-dtype array; a float array is
# what the estimators below operate on. A non-numeric column left in the
# features now fails here with a clear conversion error.
X = np.asarray(X, dtype=float)
y = np.asarray(y)
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The seed is fixed so that every
# run scores the models on the same split; without it the scores change
# from run to run and cannot be compared.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
from sklearn.linear_model import LinearRegression
from sklearn.metrics import (
    mean_squared_error,
    r2_score,
    mean_absolute_error,
    accuracy_score,
)
from math import sqrt

# Fit a linear regression on the training split and score it on the
# held-out split.
# NOTE: for scikit-learn regressors, score() reports R^2 rather than a
# classification accuracy, despite the variable name.
lreg_model = LinearRegression().fit(X_train, y_train)
accuracy_lreg = lreg_model.score(X_test, y_test)
accuracy_lreg
from sklearn.tree import DecisionTreeRegressor

# Fit a decision tree with default settings, then score it on the
# held-out split.
DT_model = DecisionTreeRegressor()
DT_model.fit(X=X_train, y=y_train)
DT_accuracy = DT_model.score(X=X_test, y=y_test)
DT_accuracy
from sklearn.ensemble import RandomForestRegressor

# Fit a random forest of 5 trees, each limited to depth 5, then score it
# on the held-out split.
RF_model = RandomForestRegressor(max_depth=5, n_estimators=5).fit(X_train, y_train)
RF_accuracy = RF_model.score(X_test, y_test)
RF_accuracy
# Requires the xgboost package (the notebook pinned version 2.0.0):
#     pip install xgboost==2.0.0
# The install is documented here instead of being run as the IPython shell
# escape "!pip install ...", which is not valid Python syntax and prevents
# the file from being parsed outside a notebook.
from xgboost import XGBRFRegressor

# NOTE(review): XGBRFRegressor is xgboost's random-forest estimator; if
# gradient boosting was intended, XGBRegressor may be the class wanted here
# - confirm.
model = XGBRFRegressor()
model.fit(X_train, y_train)
XGBoost_accuracy = model.score(X_test, y_test)
XGBoost_accuracy