import sys
import math
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
# import plotly
import plotly.express as px
!pip install ipywidgets
import ipywidgets as widgets
from ipywidgets import interact
sns.set_style('whitegrid')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
print(f'Python version: {sys.version}')
print(f'pandas version: {pd.__version__}')
print(f'numpy version: {np.__version__}')
print(f'seaborn version: {sns.__version__}')
pd.Timestamp.now()
# pd.Timestamp.now().strftime('%Y-%m-%d')
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime
import socket  # standard library; no install needed
print(f'last run: {datetime.now()}')
try:
    print(f'ip address: {socket.gethostbyname(socket.gethostname())} ({socket.gethostname()})')
except OSError:
    pass  # hostname resolution can fail on some machines; it is not essential here
def df_unique_value(df):
    """
    print the unique values of every object/category column
    """
    for c in df.columns:
        col_type = df[c].dtype
        if col_type == 'object' or col_type.name == 'category':
            print(f'{c:10}\n{df[c].unique()}')
            print('-' * 65)
def convert_cols_to_category(df, cols: list):
    """
    convert the given `cols` to `category` dtype
    """
    for c in cols:
        df[c] = df[c].astype('category')
    return df
def convert_obj_columns_to_category(df):
    """
    convert every object column to `category` dtype
    """
    for c in df.columns:
        col_type = df[c].dtype
        if col_type == 'object' or col_type.name == 'category':
            df[c] = df[c].astype('category')
    return df
def print_category_columns(df):
    """
    print the (code, label) pairs of every category column
    """
    for c in df.columns:
        col_type = df[c].dtype
        if col_type.name == 'category':
            # print(f'{c}: {df[c].cat.categories}')
            # print(pd.Series(df[c].cat.categories))
            print(f'{c:15}: {list(enumerate(df[c].cat.categories))}')
            print('-' * 60)
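# quick smoke test of the helpers above on a made-up toy frame
# (illustrative only; 'grade'/'score' are not part of the car dataset)
toy = pd.DataFrame({'grade': ['A', 'B', 'A', 'C'], 'score': [90, 80, 95, 70]})
toy = convert_obj_columns_to_category(toy)
df_unique_value(toy)
print_category_columns(toy)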
def plot_mn(df, cols, n_rows: int = 1, kind: str = 'boxplot', color='salmon'):
    """
    plot boxplot, boxen, violin, or hist in m (rows) by n (columns)
    >>> plot_mn(df, ['Calories', 'Fat'], 2, 'hist')
    """
    n = len(cols)
    n_cols = math.ceil(n / n_rows)
    fig, ax = plt.subplots(n_rows, n_cols, figsize=(n_cols*3, n_rows*3.5))
    ax = np.ravel(ax)
    for i, c in enumerate(cols):
        col_type = df[c].dtype
        if col_type.name == 'category':
            sns.countplot(data=df, x=c, ax=ax[i])
        else:
            if kind.lower() == 'boxplot':
                sns.boxplot(data=df[[c]], ax=ax[i], color=color)
            elif kind.lower() == 'boxen':
                sns.boxenplot(data=df[[c]], ax=ax[i], color=color)
            elif kind.lower() == 'violin':
                sns.violinplot(data=df[[c]], ax=ax[i], color=color)
            elif kind.lower() == 'hist':
                # distplot was deprecated/removed in recent seaborn; histplot replaces it
                sns.histplot(df[c], kde=False, ax=ax[i], color=color)
    fig.tight_layout()
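# optional smoke test of plot_mn on synthetic data (the toy columns below are
# made up; the real dataset is loaded next)
demo = pd.DataFrame({'x1': np.random.normal(size=200),
                     'x2': np.random.gamma(2, size=200)})
plot_mn(demo, ['x1', 'x2'], n_rows=1, kind='hist')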
mfg = ['bmw', 'vw', 'ford', 'toyota', 'hyundi']  # 'hyundi' is spelled as in the source file name
dfs = []
for f in mfg:
    url = f'https://github.com/prasertcbs/basic-dataset/raw/master/q2/{f}.csv'
    print(url)
    dt = pd.read_csv(url, skipinitialspace=True)
    dt['mfg'] = f  # tag each row with its manufacturer
    dfs.append(dt)
df = pd.concat(dfs, ignore_index=True)
df
df.columns
df['mileage_km'] = df['mileage'] * 1.60934    # miles -> km
df['kml'] = df['mpg'] * 0.4251437075          # mpg (US gallon) -> km per litre
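# sanity-check the conversion factors: 1 mile = 1.60934 km, so
# 1.60934 / 3.78541 L (US gallon) ≈ 0.425144 km/L as used above, whereas an
# imperial gallon (4.54609 L) would give ≈ 0.354006 km/L
print(1.60934 / 3.78541, 1.60934 / 4.54609)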
df
df.info()
df = df.dropna().reset_index(drop=True).copy()
df
df.drop_duplicates(inplace=True)
df.drop(columns=['mpg','mileage'],inplace=True)
df.isna().sum()
df = convert_obj_columns_to_category(df)
df[['mileage_km', 'kml']] = df[['mileage_km', 'kml']].round(2)
df
df.info()
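# the category conversion usually shrinks memory use noticeably;
# deep=True counts the actual string/category storage
df.memory_usage(deep=True)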
df.nlargest(10, 'kml').style.background_gradient(cmap='Blues',subset=["kml"])
plot_mn(df, df.columns, 3, 'boxen')
df.sort_values('kml', ascending=False).head(10).style.background_gradient(cmap='Blues',subset=["kml"])\
.background_gradient(cmap='Reds',subset=["engineSize"])
df = df[~df['fuelType'].isin(['Other', 'Hybrid', 'Electric'])].copy()
# drop the now-empty categories so they do not appear as zero counts below
df['fuelType'] = df['fuelType'].cat.remove_unused_categories()
fuelType_count = df['fuelType'].value_counts()
pd.DataFrame(fuelType_count).style.background_gradient(cmap='Greens')
# ax = sns.countplot(df.fuelType, palette="ch:.1", order=['Petrol','Diesel'])
# import plotly.io as pio
px.histogram(df, x='fuelType', color='fuelType', title='fuelType count plot')
# pio.write_json(fig, 'countplot.plotly')
# fig_styled = pio.read_json('countplot.plotly')
# fig_styled
df = df.reset_index(drop=True)
df.columns
cols=['model', 'year', 'price', 'transmission', 'fuelType', 'engineSize',
'mfg', 'mileage_km', 'kml']
dcorr = df[cols].corr(numeric_only=True)  # category columns are excluded from the correlation matrix
# dcorr
mask = np.zeros_like(dcorr)
# mask.shape
mask[np.triu_indices_from(mask)] = True
fig, ax = plt.subplots(figsize=(10,8))
sns.heatmap(dcorr, cmap=sns.diverging_palette(10, 145, n=100),
vmin=-1, vmax=1, center=0, linewidths=1, annot=True, mask=mask, ax=ax).set_title("Correlation\nHeatmap", fontsize=22,fontweight="bold");
feature_cols=['year','transmission', 'fuelType', 'engineSize',
'mfg', 'mileage_km', 'kml']
target_col='price'
X=df[feature_cols]
y=df[target_col]
from sklearn.model_selection import train_test_split,cross_val_score
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)
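# a trivial baseline for context: predicting the training mean for every test
# row should give an r2 near 0, which any useful model must beat
from sklearn.metrics import r2_score
y_base = np.full(len(y_test), y_train.mean())
print(f'mean-baseline r2 = {r2_score(y_test, y_base):.4f}')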
!pip install flaml
from flaml import AutoML
automl = AutoML()
settings = {
"time_budget": 100, # total running time in seconds
"metric": 'r2', # primary metrics for regression can be chosen from: ['mae','mse','r2']
"estimator_list": ['lgbm'], # list of ML learners; we tune lightgbm in this example
"task": 'regression', # task type
"log_file_name": 'UK_used_car_price.log', # flaml log file
}
# the main flaml automl API
automl.fit(X_train=X_train, y_train=y_train, **settings)
automl.model
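# the winning configuration and its validation loss are exposed directly
# on the AutoML object
print(automl.best_config)
print(f'best validation loss (1 - r2) = {automl.best_loss:.4f}')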
# compute predictions on the test set
y_pred = automl.predict(X_test)
print('Predicted values', y_pred)
print('True values', y_test)
# compute different metric values on the test set
from flaml.ml import sklearn_metric_loss_score
print('r2', '=', 1 - sklearn_metric_loss_score('r2', y_pred, y_test))
print('mse', '=', sklearn_metric_loss_score('mse', y_pred, y_test))
print('mae', '=', sklearn_metric_loss_score('mae', y_pred, y_test))
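# cross-check flaml's metric helper against sklearn's own implementations;
# the numbers should agree (note sklearn's argument order is y_true, y_pred)
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
print('sklearn r2 =', r2_score(y_test, y_pred))
print('sklearn mse =', mean_squared_error(y_test, y_pred))
print('sklearn mae =', mean_absolute_error(y_test, y_pred))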
from flaml.data import get_output_from_log
time_history, best_valid_loss_history, valid_loss_history, config_history, train_loss_history = \
    get_output_from_log(filename=settings['log_file_name'], time_budget=settings['time_budget'])
for config in config_history:
print(config)
plt.title('Learning Curve')
plt.xlabel('Wall Clock Time (s)')
plt.ylabel('Validation r2')
plt.scatter(time_history, 1 - np.array(valid_loss_history))
plt.step(time_history, 1 - np.array(best_valid_loss_history), where='post')
plt.show()
automl.best_iteration
print('flaml r2', '=', 1 - sklearn_metric_loss_score('r2', y_pred, y_test))
import lightgbm as lgb
print(f'lightgbm version = {lgb.__version__}')
# default LGBMRegressor parameters, with reg_alpha raised to 5.0
params = {'boosting_type': 'gbdt',
'class_weight': None,
'colsample_bytree': 1.0,
'importance_type': 'split',
'learning_rate': 0.1,
'max_depth': -1,
'min_child_samples': 20,
'min_child_weight': 0.001,
'min_split_gain': 0.0,
'n_estimators': 100,
'n_jobs': -1,
'num_leaves': 31,
'objective': None,
'random_state': None,
'reg_alpha': 5.0,
'reg_lambda': 0.0,
        'verbosity': -1,  # quiet training; 'silent' no longer exists in LightGBM 4.x
'subsample': 1.0,
'subsample_for_bin': 200000,
'subsample_freq': 0
}
# reg = lgb.LGBMRegressor()
lgbm = lgb.LGBMRegressor(**params)
# in LightGBM >= 4.0, early stopping and logging are configured via callbacks
# rather than fit() keyword arguments
fit_params = {
    'eval_set': [(X_test, y_test)],
    'callbacks': [lgb.early_stopping(100), lgb.log_evaluation(10)],
}
lgbm.fit(X_train, y_train, **fit_params)  # records eval history for lgb.plot_metric
y_pred = lgbm.predict(X_test)
from flaml.ml import sklearn_metric_loss_score
print('default lgbm r2', '=', 1 - sklearn_metric_loss_score('r2', y_pred, y_test))
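# for a like-for-like comparison, the estimator FLAML actually fitted can be
# pulled out of the AutoML object (automl.model.estimator is the underlying
# LGBMRegressor) and scored the same way
tuned_lgbm = automl.model.estimator
print('tuned lgbm r2', '=', 1 - sklearn_metric_loss_score('r2', tuned_lgbm.predict(X_test), y_test))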
lgb.plot_metric(lgbm)  # after roughly 20 training rounds the l2 metric starts to level off
lgbm.score(X_train, y_train)
lgbm.predict(X_test[:5])
from sklearn.model_selection import ShuffleSplit
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=1)  # fixed seed for reproducibility
scores = cross_val_score(lgbm, X, y, cv=cv)
print(scores)
print(f'mean scores = {scores.mean()}, sd={scores.std():.4f}')
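# a sketch with cross_validate instead of cross_val_score, collecting several
# metrics and the fit times in one pass
from sklearn.model_selection import cross_validate
cv_results = cross_validate(lgbm, X, y, cv=cv, scoring=['r2', 'neg_mean_absolute_error'])
pd.DataFrame(cv_results).mean()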
!pip install -U graphviz  # Python bindings only; the Graphviz system binaries must be installed separately
show_info=['split_gain', 'internal_value', 'data_percentage', 'leaf_count']
orientation='vertical'
max_num_trees = lgbm.booster_.num_trees()  # booster_ is the public accessor for the fitted Booster
w_tree_index=widgets.BoundedIntText(
value=0,
min=0,
max=max_num_trees-1,
step=1,
continuous_update=True,
)
w_max_depth=widgets.BoundedIntText(
value=3,
min=2,
max=10,
step=1,
continuous_update=True,
)
w_n_estimators=widgets.BoundedIntText(
value=3,
min=1,
max=100,
step=1,
continuous_update=True,
)
@interact
def plot_tree(tree_index=w_tree_index, orientation=['vertical', 'horizontal'], max_depth=w_max_depth, n_estimators=w_n_estimators, save_tree_img=[False, True]):
    # refit a small model at the widget-chosen size; shallow trees with few
    # estimators are much easier to read than the full tuned model
    params = {
        'subsample': 1.0,
        'n_estimators': n_estimators,
        'max_depth': max_depth
    }
    model = lgb.LGBMRegressor(**params).fit(X_train, y_train)
    case_index = 1
    print(X_test.iloc[[case_index]])
    print(f'predicted value = {model.predict(X_test.iloc[[case_index]])}')
    w_tree_index.max = n_estimators - 1
    g = lgb.create_tree_digraph(model, orientation=orientation, tree_index=tree_index, show_info=show_info, precision=4)
    if save_tree_img:
        g.format = 'png'
        g.render(f'tree{tree_index}', view=False, cleanup=True)
    return g
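# a non-interactive alternative to the widget above: lgb.plot_tree renders a
# single tree with matplotlib (it still relies on graphviz under the hood)
fig, ax = plt.subplots(figsize=(20, 8))
lgb.plot_tree(lgbm, tree_index=0, ax=ax, show_info=show_info)
plt.show()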
!pip install shap
import shap
print(f'shap version {shap.__version__}')
# load JS visualization code to notebook
shap.initjs()
def case_detail(case_data):
    '''
    format the object returned from shap.force_plot()
    '''
    de = pd.DataFrame(case_data.data['features'])
    fcols = [case_data.data['featureNames'][i] for i in case_data.data['features'].keys()]
    de.columns = fcols
    return de
def individual_case_plot(explainer, X, case_index, verbose=False):
"""
>>> individual_case_plot(explainer, X_train, 1)
"""
shap_values = explainer.shap_values(X.iloc[[case_index]])
g=shap.force_plot(explainer.expected_value, shap_values=shap_values, features=X.iloc[case_index, :])
    if verbose:
        from pprint import pprint  # stdlib; only needed for the verbose dump
        pprint(g.__dict__)
return g
# explain the model's predictions using SHAP
explainer = shap.TreeExplainer(lgbm)
shap_values = explainer.shap_values(X)
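# example use of the individual_case_plot helper; shap.save_html writes the
# interactive force plot to a standalone HTML file ('force_plot_case1.html'
# is an arbitrary name)
g = individual_case_plot(explainer, X_train, 1)
shap.save_html('force_plot_case1.html', g)
g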
X
shap_values[:3]
dshap=pd.DataFrame(shap_values, columns=feature_cols)
dshap
# mean(abs(SHAP value)): average impact on model output magnitude
feature_imp = np.abs(dshap).mean().sort_values(ascending=False)
pd.DataFrame(feature_imp).style.background_gradient(cmap='Blues')
shap.summary_plot(shap_values, X, plot_type="bar")  # pass the numpy array; summary_plot indexes it column-wise
# summarize the effects of all the features
shap.summary_plot(shap_values, X)
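# a dependence plot drills into a single feature's effect; 'engineSize' is
# just one example column (interaction coloring disabled to keep it simple)
shap.dependence_plot('engineSize', shap_values, X, interaction_index=None)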