import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Show every column when a DataFrame is displayed instead of truncating wide frames.
pd.options.display.max_columns = None

# Location of the car-price CSV inside the Kaggle input directory.
data_path = '/kaggle/input/car-price-prediction/CarPrice_Assignment.csv'
data_path

df = pd.read_csv(data_path)
df.head()
df.shape

# Column names, non-null counts and data types.
df.info()

# Number of distinct values in each column.
df.nunique()

# Summary statistics for the numerical columns.
df.describe()

# Summary statistics for the string (object) columns.
df.describe(include='object')

# Drop the identifier column before looking for duplicates.
# NOTE(review): car_ID is presumably unique per row -- confirm with the
# df.nunique() output above. If it is, leaving it in place would make every
# row look distinct and the duplicate check below could never find anything.
df.drop('car_ID', axis=1, inplace=True)

# Count fully duplicated rows among the remaining feature columns.
df.duplicated().sum()
df.shape
df['CarName'].head(10)

# The brand is the first whitespace-separated token of the car name.
# The vectorised .str accessor yields NaN for an empty or missing name
# instead of raising, unlike a per-element s.split()[0].
df['CarBrand'] = df['CarName'].str.split().str[0]
df['CarBrand'].value_counts()

# Misspelled or inconsistently cased brand labels -> the label they are merged into.
brand_map = {
    'toyouta': 'toyota',
    'Nissan': 'nissan',
    'maxda': 'mazda',
    'vokswagen': 'volkswagen',
    'vw': 'volkswagen',
    'porcshce': 'porsche',
}
# Series.replace swaps exact matches of the keys and leaves every other
# value untouched, which is what the manual "lookup or keep" lambda did.
df['CarBrand'] = df['CarBrand'].replace(brand_map)
df['CarBrand'].value_counts()

# CarName is no longer needed once the cleaned brand has been extracted.
df.drop('CarName', axis=1, inplace=True)
df.head()
# Count the missing entries in every column.
df.isnull().sum()
# Preview only the columns whose dtype is object.
df.loc[:, df.dtypes.eq('object')].head()
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Each chart is started on its own figure so that consecutive plots never
# draw on top of one another when these statements run back to back
# (same pattern as the plt.figure(...) call used for the heatmap).

# Distribution of the target variable.
plt.figure()
sns.histplot(df['price'])

plt.figure()
sns.boxplot(x=df['price'])
df['price'].describe()
df.columns

# Pass the column explicitly as x: recent seaborn releases bind the first
# positional argument to `data`, so a bare Series is not read as the x variable.
plt.figure()
sns.countplot(x=df['symboling'])

# Price spread within each level of a few individual columns.
plt.figure()
sns.boxplot(y='price', x='symboling', data=df)

plt.figure()
sns.boxplot(y='price', x='fueltype', data=df)

plt.figure()
sns.boxplot(y='price', x='aspiration', data=df)
# Annotated heatmap of the pairwise correlations between numeric columns.
plt.figure(figsize=(12,8))
# Restrict to numeric columns explicitly: df still holds string columns here,
# and pandas 2.x raises when DataFrame.corr() meets non-numeric data instead
# of silently skipping it. select_dtypes works on old and new pandas alike.
corr = df.select_dtypes(include='number').corr()
sns.heatmap(corr, annot=True)
# Columns treated as categorical: string columns plus int64 columns.
# NOTE(review): selecting by dtype also picks up every integer-valued
# measurement, not just discrete codes -- confirm that is intended.
cat_cols = df.columns[df.dtypes.isin(['object', 'int64'])]
cat_cols
numerical_cols = df.columns[df.dtypes == 'float64']
numerical_cols
print(f'Total categorical columns: {len(cat_cols)}')

# Grid layout: a fixed number of panels per row, as many rows as needed.
ncols = 4
total_cols = len(cat_cols)
nrows = int(np.ceil(total_cols / ncols))
print(total_cols, nrows, ncols)

# squeeze=False keeps axs two-dimensional even when the grid has a single
# row, so the panel lookup below works for any number of columns.
fig, axs = plt.subplots(
    ncols=ncols, nrows=nrows, sharey=True, figsize=(20,20), squeeze=False
)
panels = axs.ravel()  # row-major order: panel i sits at (i // ncols, i % ncols)

# One price boxplot per categorical column.
for ax, col in zip(panels, cat_cols):
    sns.boxplot(y='price', x=col, data=df, ax=ax)

# Hide the leftover panels of the last row instead of showing empty axes.
for ax in panels[total_cols:]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()