import sys
import math
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
# import plotly
import plotly.express as px
!pip install ipywidgets
import ipywidgets as widgets
from ipywidgets import interact
sns.set_style('whitegrid')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
print(f'Python version: {sys.version}')
print(f'pandas version: {pd.__version__}')
print(f'numpy version: {np.__version__}')
print(f'seaborn version: {sns.__version__}')
pd.Timestamp.now()
# pd.Timestamp.now().strftime('%Y-%m-%d')
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime
import socket  # standard library; no install needed
print(f'last run: {datetime.now()}')
try:
    print(f'ip address: {socket.gethostbyname(socket.gethostname())} ({socket.gethostname()})')
except OSError:
    pass  # hostname resolution can fail on some machines; it is not essential here
def df_unique_value(df):
    """
    print the unique values of every object/category column
    """
    for c in df.columns:
        col_type = df[c].dtype
        if col_type == 'object' or col_type.name == 'category':
            print(f'{c:10}\n{df[c].unique()}')
            print('-' * 65)
def convert_cols_to_category(df, cols: list):
    """
    convert the given `cols` to `category` dtype
    """
    for c in cols:
        df[c] = df[c].astype('category')
    return df
def convert_obj_columns_to_category(df):
    """
    convert every object column to `category` dtype
    """
    for c in df.columns:
        col_type = df[c].dtype
        if col_type == 'object' or col_type.name == 'category':
            df[c] = df[c].astype('category')
    return df
def print_category_columns(df):
    """
    print the (code, label) pairs of every category column
    """
    for c in df.columns:
        col_type = df[c].dtype
        if col_type.name == 'category':
            # print(f'{c}: {df[c].cat.categories}')
            # print(pd.Series(df[c].cat.categories))
            print(f'{c:15}: {list(enumerate(df[c].cat.categories))}')
            print('-' * 60)
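# quick smoke test of the helpers above on a made-up toy frame
# (illustrative only; 'grade'/'score' are not part of the car dataset)
toy = pd.DataFrame({'grade': ['A', 'B', 'A', 'C'], 'score': [90, 80, 95, 70]})
toy = convert_obj_columns_to_category(toy)
df_unique_value(toy)
print_category_columns(toy)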
def plot_mn(df, cols, n_rows: int = 1, kind: str = 'boxplot', color='salmon'):
    """
    plot boxplot, boxen, violin, or hist in m (rows) by n (columns)
    >>> plot_mn(df, ['Calories', 'Fat'], 2, 'hist')
    """
    n = len(cols)
    n_cols = math.ceil(n / n_rows)
    fig, ax = plt.subplots(n_rows, n_cols, figsize=(n_cols*3, n_rows*3.5))
    ax = np.ravel(ax)
    for i, c in enumerate(cols):
        col_type = df[c].dtype
        if col_type.name == 'category':
            sns.countplot(data=df, x=c, ax=ax[i])
        else:
            if kind.lower() == 'boxplot':
                sns.boxplot(data=df[[c]], ax=ax[i], color=color)
            elif kind.lower() == 'boxen':
                sns.boxenplot(data=df[[c]], ax=ax[i], color=color)
            elif kind.lower() == 'violin':
                sns.violinplot(data=df[[c]], ax=ax[i], color=color)
            elif kind.lower() == 'hist':
                # distplot was deprecated/removed in recent seaborn; histplot replaces it
                sns.histplot(df[c], kde=False, ax=ax[i], color=color)
    fig.tight_layout()
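# optional smoke test of plot_mn on synthetic data (the toy columns below are
# made up; the real dataset is loaded next)
demo = pd.DataFrame({'x1': np.random.normal(size=200),
                     'x2': np.random.gamma(2, size=200)})
plot_mn(demo, ['x1', 'x2'], n_rows=1, kind='hist')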
mfg = ['bmw', 'vw', 'ford', 'toyota', 'hyundi']  # 'hyundi' is spelled as in the source file name
dfs = []
for f in mfg:
    url = f'https://github.com/prasertcbs/basic-dataset/raw/master/q2/{f}.csv'
    print(url)
    dt = pd.read_csv(url, skipinitialspace=True)
    dt['mfg'] = f  # tag each row with its manufacturer
    dfs.append(dt)
df = pd.concat(dfs, ignore_index=True)
df
df.columns
df['mileage_km'] = df['mileage'] * 1.60934    # miles -> km
df['kml'] = df['mpg'] * 0.4251437075          # mpg (US gallon) -> km per litre
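# sanity-check the conversion factors: 1 mile = 1.60934 km, so
# 1.60934 / 3.78541 L (US gallon) ≈ 0.425144 km/L as used above, whereas an
# imperial gallon (4.54609 L) would give ≈ 0.354006 km/L
print(1.60934 / 3.78541, 1.60934 / 4.54609)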
df
df.info()
df = df.dropna().reset_index(drop=True).copy()
df
df.drop_duplicates(inplace=True)
df.drop(columns=['mpg','mileage'],inplace=True)
df.isna().sum()
df = convert_obj_columns_to_category(df)
df[['mileage_km', 'kml']] = df[['mileage_km', 'kml']].round(2)
df
df.info()
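# the category conversion usually shrinks memory use noticeably;
# deep=True counts the actual string/category storage
df.memory_usage(deep=True)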
df.nlargest(10, 'kml').style.background_gradient(cmap='Blues',subset=["kml"])
plot_mn(df, df.columns, 3, 'boxen')
df.sort_values('kml', ascending=False).head(10).style.background_gradient(cmap='Blues',subset=["kml"])\
.background_gradient(cmap='Reds',subset=["engineSize"])
df = df[~df['fuelType'].isin(['Other', 'Hybrid', 'Electric'])].copy()
# drop the now-empty categories so they do not appear as zero counts below
df['fuelType'] = df['fuelType'].cat.remove_unused_categories()
fuelType_count = df['fuelType'].value_counts()
pd.DataFrame(fuelType_count).style.background_gradient(cmap='Greens')
# ax = sns.countplot(df.fuelType, palette="ch:.1", order=['Petrol','Diesel'])
# import plotly.io as pio
px.histogram(df, x='fuelType', color='fuelType', title='fuelType count plot')
# pio.write_json(fig, 'countplot.plotly')
# fig_styled = pio.read_json('countplot.plotly')
# fig_styled
df = df.reset_index(drop=True)
df.columns
cols=['model', 'year', 'price', 'transmission', 'fuelType', 'engineSize',
'mfg', 'mileage_km', 'kml']
dcorr = df[cols].corr(numeric_only=True)  # category columns are excluded from the correlation matrix
# dcorr
mask = np.zeros_like(dcorr)
# mask.shape
mask[np.triu_indices_from(mask)] = True
fig, ax = plt.subplots(figsize=(10,8))
sns.heatmap(dcorr, cmap=sns.diverging_palette(10, 145, n=100),
vmin=-1, vmax=1, center=0, linewidths=1, annot=True, mask=mask, ax=ax).set_title("Correlation\nHeatmap", fontsize=22,fontweight="bold");
feature_cols=['year','transmission', 'fuelType', 'engineSize',
'mfg', 'mileage_km', 'kml']
target_col='price'
X=df[feature_cols]
y=df[target_col]
from sklearn.model_selection import train_test_split,cross_val_score
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)
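# a trivial baseline for context: predicting the training mean for every test
# row should give an r2 near 0, which any useful model must beat
from sklearn.metrics import r2_score
y_base = np.full(len(y_test), y_train.mean())
print(f'mean-baseline r2 = {r2_score(y_test, y_base):.4f}')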
!pip install flaml
from flaml import AutoML
automl = AutoML()
settings = {
"time_budget": 100, # total running time in seconds
"metric": 'r2', # primary metrics for regression can be chosen from: ['mae','mse','r2']
"estimator_list": ['lgbm'], # list of ML learners; we tune lightgbm in this example
"task": 'regression', # task type
"log_file_name": 'UK_used_car_price.log', # flaml log file
}
# the main flaml automl API
automl.fit(X_train=X_train, y_train=y_train, **settings)
automl.model
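# the winning configuration and its validation loss are exposed directly
# on the AutoML object
print(automl.best_config)
print(f'best validation loss (1 - r2) = {automl.best_loss:.4f}')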
# compute predictions on the test set
y_pred = automl.predict(X_test)
print('Predicted values', y_pred)
print('True values', y_test)
# compute different metric values on the test set
from flaml.ml import sklearn_metric_loss_score
print('r2', '=', 1 - sklearn_metric_loss_score('r2', y_pred, y_test))
print('mse', '=', sklearn_metric_loss_score('mse', y_pred, y_test))
print('mae', '=', sklearn_metric_loss_score('mae', y_pred, y_test))
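# cross-check flaml's metric helper against sklearn's own implementations;
# the numbers should agree (note sklearn's argument order is y_true, y_pred)
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
print('sklearn r2 =', r2_score(y_test, y_pred))
print('sklearn mse =', mean_squared_error(y_test, y_pred))
print('sklearn mae =', mean_absolute_error(y_test, y_pred))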
from flaml.data import get_output_from_log
time_history, best_valid_loss_history, valid_loss_history, config_history, train_loss_history = \
    get_output_from_log(filename=settings['log_file_name'], time_budget=settings['time_budget'])
for config in config_history:
print(config)
plt.title('Learning Curve')
plt.xlabel('Wall Clock Time (s)')
plt.ylabel('Validation r2')
plt.scatter(time_history, 1 - np.array(valid_loss_history))
plt.step(time_history, 1 - np.array(best_valid_loss_history), where='post')
plt.show()
automl.best_iteration
print('flaml r2', '=', 1 - sklearn_metric_loss_score('r2', y_pred, y_test))
import lightgbm as lgb
print(f'lightgbm version = {lgb.__version__}')
# default LGBMRegressor parameters, with reg_alpha raised to 5.0
params = {'boosting_type': 'gbdt',
'class_weight': None,
'colsample_bytree': 1.0,
'importance_type': 'split',
'learning_rate': 0.1,
'max_depth': -1,
'min_child_samples': 20,
'min_child_weight': 0.001,
'min_split_gain': 0.0,
'n_estimators': 100,
'n_jobs': -1,
'num_leaves': 31,
'objective': None,
'random_state': None,
'reg_alpha': 5.0,
'reg_lambda': 0.0,
        'verbosity': -1,  # quiet training; 'silent' no longer exists in LightGBM 4.x
'subsample': 1.0,
'subsample_for_bin': 200000,
'subsample_freq': 0
}
# reg = lgb.LGBMRegressor()
lgbm = lgb.LGBMRegressor(**params)
# in LightGBM >= 4.0, early stopping and logging are configured via callbacks
# rather than fit() keyword arguments
fit_params = {
    'eval_set': [(X_test, y_test)],
    'callbacks': [lgb.early_stopping(100), lgb.log_evaluation(10)],
}
lgbm.fit(X_train, y_train, **fit_params)  # records eval history for lgb.plot_metric
y_pred = lgbm.predict(X_test)
from flaml.ml import sklearn_metric_loss_score
print('default lgbm r2', '=', 1 - sklearn_metric_loss_score('r2', y_pred, y_test))
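# for a like-for-like comparison, the estimator FLAML actually fitted can be
# pulled out of the AutoML object (automl.model.estimator is the underlying
# LGBMRegressor) and scored the same way
tuned_lgbm = automl.model.estimator
print('tuned lgbm r2', '=', 1 - sklearn_metric_loss_score('r2', tuned_lgbm.predict(X_test), y_test))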
lgb.plot_metric(lgbm)  # after roughly 20 training rounds the l2 metric starts to level off
lgbm.score(X_train, y_train)
lgbm.predict(X_test[:5])
from sklearn.model_selection import ShuffleSplit
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=1)  # fixed seed for reproducibility
scores = cross_val_score(lgbm, X, y, cv=cv)
print(scores)
print(f'mean scores = {scores.mean()}, sd={scores.std():.4f}')
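# a sketch with cross_validate instead of cross_val_score, collecting several
# metrics and the fit times in one pass
from sklearn.model_selection import cross_validate
cv_results = cross_validate(lgbm, X, y, cv=cv, scoring=['r2', 'neg_mean_absolute_error'])
pd.DataFrame(cv_results).mean()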
!pip install -U graphviz  # Python bindings only; the Graphviz system binaries must be installed separately
show_info=['split_gain', 'internal_value', 'data_percentage', 'leaf_count']
orientation='vertical'
max_num_trees = lgbm.booster_.num_trees()  # booster_ is the public accessor for the fitted Booster
w_tree_index=widgets.BoundedIntText(
value=0,
min=0,
max=max_num_trees-1,
step=1,
continuous_update=True,
)
w_max_depth=widgets.BoundedIntText(
value=3,
min=2,
max=10,
step=1,
continuous_update=True,
)
w_n_estimators=widgets.BoundedIntText(
value=3,
min=1,
max=100,
step=1,
continuous_update=True,
)
@interact
def plot_tree(tree_index=w_tree_index, orientation=['vertical', 'horizontal'], max_depth=w_max_depth, n_estimators=w_n_estimators, save_tree_img=[False, True]):
    # refit a small model at the widget-chosen size; shallow trees with few
    # estimators are much easier to read than the full tuned model
    params = {
        'subsample': 1.0,
        'n_estimators': n_estimators,
        'max_depth': max_depth
    }
    model = lgb.LGBMRegressor(**params).fit(X_train, y_train)
    case_index = 1
    print(X_test.iloc[[case_index]])
    print(f'predicted value = {model.predict(X_test.iloc[[case_index]])}')
    w_tree_index.max = n_estimators - 1
    g = lgb.create_tree_digraph(model, orientation=orientation, tree_index=tree_index, show_info=show_info, precision=4)
    if save_tree_img:
        g.format = 'png'
        g.render(f'tree{tree_index}', view=False, cleanup=True)
    return g
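# a non-interactive alternative to the widget above: lgb.plot_tree renders a
# single tree with matplotlib (it still relies on graphviz under the hood)
fig, ax = plt.subplots(figsize=(20, 8))
lgb.plot_tree(lgbm, tree_index=0, ax=ax, show_info=show_info)
plt.show()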
!pip install shap
import shap
print(f'shap version {shap.__version__}')
# load JS visualization code to notebook
shap.initjs()
def case_detail(case_data):
    '''
    format the object returned from shap.force_plot()
    '''
    de = pd.DataFrame(case_data.data['features'])
    fcols = [case_data.data['featureNames'][i] for i in case_data.data['features'].keys()]
    de.columns = fcols
    return de
def individual_case_plot(explainer, X, case_index, verbose=False):
"""
>>> individual_case_plot(explainer, X_train, 1)
"""
shap_values = explainer.shap_values(X.iloc[[case_index]])
g=shap.force_plot(explainer.expected_value, shap_values=shap_values, features=X.iloc[case_index, :])
    if verbose:
        from pprint import pprint  # stdlib; only needed for the verbose dump
        pprint(g.__dict__)
return g
# explain the model's predictions using SHAP
explainer = shap.TreeExplainer(lgbm)
shap_values = explainer.shap_values(X)
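# example use of the individual_case_plot helper; shap.save_html writes the
# interactive force plot to a standalone HTML file ('force_plot_case1.html'
# is an arbitrary name)
g = individual_case_plot(explainer, X_train, 1)
shap.save_html('force_plot_case1.html', g)
g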
X
shap_values[:3]
dshap=pd.DataFrame(shap_values, columns=feature_cols)
dshap
# mean(abs(SHAP value)): average impact on model output magnitude
feature_imp = np.abs(dshap).mean().sort_values(ascending=False)
pd.DataFrame(feature_imp).style.background_gradient(cmap='Blues')
shap.summary_plot(shap_values, X, plot_type="bar")  # pass the numpy array; summary_plot indexes it column-wise
# summarize the effects of all the features
shap.summary_plot(shap_values, X)
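# a dependence plot drills into a single feature's effect; 'engineSize' is
# just one example column (interaction coloring disabled to keep it simple)
shap.dependence_plot('engineSize', shap_values, X, interaction_index=None)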