XGBoost: Used Cars Price Prediction

import numpy as np  # Multi-dimensional array object
import pandas as pd  # Data manipulation
import seaborn as sns  # Data visualization
import matplotlib.pyplot as plt  # Data visualization
import plotly.express as px  # Interactive data visualization

# Optional notebook theming and offline Plotly helpers, left disabled:
# from jupyterthemes import jtplot  # Jupyter Notebook theme
# jtplot.style(theme = 'monokai', context = 'notebook', ticks = True, grid = False)
# from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot  # Offline version of the Plotly modules.

# Read the cars dataset from the current working directory and preview it.
csv_path = './cars_data.csv'
df = pd.read_csv(csv_path)
df.head()

# Column labels of the loaded frame.
df.columns

# (row count, column count).
df.shape

# Per-column dtype and non-null counts.
df.info()

# Keep only the rows that have no missing value in any column.
df = df.dropna(how='any')

# Turn the currency-formatted price columns into integers.
#
# The '$' and ',' characters are removed with regex=False so they are always
# treated as literal text: '$' is a regular-expression anchor, and the default
# value of `regex` in Series.str.replace differs between pandas versions.
#
# Invoice is cast to int just like MSRP so that both columns are numeric for
# the correlation matrix computed further down.
# NOTE(review): assumes Invoice uses the same "$12,345" formatting as MSRP — confirm.
for price_column in ('MSRP', 'Invoice'):
    df[price_column] = (
        df[price_column]
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype(int)
    )
df.head()

# Dtypes and non-null counts again, now that the price columns were processed.
df.info()

# Number of missing values left in each column.
df.isnull().sum()

# Grid of pairwise plots over the columns of the frame.
sns.pairplot(data=df)

# Histogram of the "Type" column.
# The axis caption names the plotted column ("Car Type"); it previously read
# "Manufacturer", which describes the Make column rather than Type.
fig = px.histogram(
    df,
    x="Type",
    labels={"Type": "Car Type"},
    title="Car Type",
    color_discrete_sequence=["blue"],
)
fig.show()

# Distinct values found in the Origin column.
df["Origin"].unique()

# Plot Location
fig = px.histogram(
    data_frame=df,
    x='Origin',
    labels={"Origin": "Origin"},
    title="Location Of Car Sales",
    color_discrete_sequence=["red"],
)
fig.show()

# Let's view the drivetrain of the cars
df["DriveTrain"].unique()

# Histogram of the DriveTrain column.
fig = px.histogram(
    data_frame=df,
    x="DriveTrain",
    labels={"DriveTrain": "Drivetrain"},
    title="Drivetrain of the car",
    color_discrete_sequence=["BLACK"],
)
fig.show()

# Plot the make of the car and its location
# Bars are grouped by Make and coloured by the Origin column.
fig = px.histogram(
    data_frame=df,
    x="Make",
    color="Origin",
    labels={"Make": "Manufacturer"},
    title="location vs make of the car",
)
fig.show()

# Correlation heatmap over a fixed selection of columns.
numeric_feature_names = [
    'MSRP',
    'Invoice',
    'EngineSize',
    'Cylinders',
    'Horsepower',
    'MPG_City',
    'MPG_Highway',
    'Weight',
    'Wheelbase',
    'Length',
]
numeric_columns = df[numeric_feature_names]
correlation_matrix = numeric_columns.corr()

# Large square canvas with the coefficient printed in each cell.
plt.figure(figsize=(18, 18))
sns.heatmap(data=correlation_matrix, annot=True, cmap="YlGnBu")

df.head()

# One-hot encode the listed columns; every other column is passed through.
categorical_columns = ["Make", "Model", "Type", "Origin", "DriveTrain"]
df_dummy = pd.get_dummies(df, columns=categorical_columns)
df_dummy.head()

# Remove the Invoice column from the frame used for modelling.
df_data = df_dummy.drop(columns=["Invoice"])
df_data.head()

# Features to X and Output (MSRP) to y
X = df_data.drop(columns="MSRP")
y = df_data["MSRP"]

# Convert both to NumPy arrays for the estimators below.
X, y = np.array(X), np.array(y)

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation.
# A fixed random_state makes the shuffle reproducible, so the scores of the
# models trained below are computed on the same split from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

from math import sqrt

from sklearn.linear_model import LinearRegression
from sklearn.metrics import (
    accuracy_score,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
)

# Fit an ordinary linear regression on the training split and score it on the
# held-out split. For scikit-learn regressors `.score` is the R^2 coefficient,
# even though the variable is named "accuracy".
lreg_model = LinearRegression().fit(X_train, y_train)
accuracy_lreg = lreg_model.score(X_test, y_test)
accuracy_lreg

from sklearn.tree import DecisionTreeRegressor

# Decision tree regressor with default hyper-parameters, fitted on the
# training split and scored on the held-out split.
DT_model = DecisionTreeRegressor().fit(X_train, y_train)
DT_accuracy = DT_model.score(X_test, y_test)
DT_accuracy

from sklearn.ensemble import RandomForestRegressor

# Small forest: 5 trees, each limited to depth 5.
rf_params = {"n_estimators": 5, "max_depth": 5}
RF_model = RandomForestRegressor(**rf_params)
RF_model.fit(X_train, y_train)

# Score on the held-out split.
RF_accuracy = RF_model.score(X_test, y_test)
RF_accuracy

# Notebook shell escape (leading "!"): runs pip to install xgboost pinned to
# version 2.0.0. This syntax only works inside IPython/Jupyter; it is not
# valid in a plain Python script.
!pip install xgboost==2.0.0

from xgboost import XGBRFRegressor

# NOTE(review): XGBRFRegressor is xgboost's random-forest estimator; if
# gradient boosting was intended here, confirm whether XGBRegressor should be
# used instead.
model = XGBRFRegressor().fit(X_train, y_train)

# Score on the held-out split.
XGBoost_accuracy = model.score(X_test, y_test)
XGBoost_accuracy