import sys
import math
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
# import plotly
import plotly.express as px
!pip install ipywidgets
import ipywidgets as widgets
from ipywidgets import interact
sns.set_style('whitegrid')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
print(f'Python  version: {sys.version}')
print(f'pandas  version: {pd.__version__}')
print(f'numpy   version: {np.__version__}')
print(f'seaborn version: {sns.__version__}')
pd.Timestamp.now()
# pd.Timestamp.now().strftime('%Y-%m-%d')
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime
import socket  # standard library; no install needed
print(f'last run: {datetime.now()}')
try:
    print(f'ip address: {socket.gethostbyname(socket.gethostname())} ({socket.gethostname()})')
except Exception:
    pass
def df_unique_value(df):
    """print the unique values of every object/category column in `df`"""
    for c in df.columns:
        col_type = df[c].dtype
        if col_type == 'object' or col_type.name == 'category':
            print(f'{c:10}\n{df[c].unique()}')
            print('-' * 65)
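# a tiny self-contained check of df_unique_value (toy frame, not part of the dataset):
# only the object column 'fuel' should be printed
_demo = pd.DataFrame({'fuel': ['Petrol', 'Diesel', 'Petrol'], 'price': [1, 2, 3]})
df_unique_value(_demo)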
            
def convert_cols_to_category(df, cols: list):
    """
    convert the given `cols` to the pandas `category` dtype
    """
    for c in cols:
        df[c] = df[c].astype('category')
    return df
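# usage sketch (hypothetical column names; the real columns appear after loading):
# df = convert_cols_to_category(df, ['transmission', 'fuelType'])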
    
def convert_obj_columns_to_category(df):
    """convert every object (string) column in `df` to the `category` dtype"""
    for c in df.columns:
        if df[c].dtype == 'object':
            df[c] = df[c].astype('category')
    return df
def print_category_columns(df):
    """print each category column with its (code, category) pairs"""
    for c in df.columns:
        if df[c].dtype.name == 'category':
            print(f'{c:15}: {list(enumerate(df[c].cat.categories))}')
            print('-' * 60)
            
def plot_mn(df, cols, n_rows: int = 1, kind: str = 'boxplot', color='salmon'):
    """
    plot boxplot, boxen, violin, or hist in m (rows) by n (columns)
    >>> plot_mn(df, ['Calories', 'Fat'], 2, 'hist')
    """
    n = len(cols)
    n_cols = math.ceil(n / n_rows)
    fig, ax = plt.subplots(n_rows, n_cols, figsize=(n_cols*3, n_rows*3.5))
    ax = np.atleast_1d(ax).ravel()  # handle the single-subplot case, where ax is not an array
    fig.tight_layout()
    for i, c in enumerate(cols):
        if df[c].dtype.name == 'category':
            sns.countplot(data=df, x=c, ax=ax[i])
        elif kind.lower() == 'boxplot':
            sns.boxplot(data=df[[c]], ax=ax[i], color=color)
        elif kind.lower() == 'boxen':
            sns.boxenplot(data=df[[c]], ax=ax[i], color=color)
        elif kind.lower() == 'violin':
            sns.violinplot(data=df[[c]], ax=ax[i], color=color)
        elif kind.lower() == 'hist':
            sns.histplot(df[c], kde=False, ax=ax[i], color=color)  # distplot is deprecated
    for j in range(n, len(ax)):  # hide any leftover empty axes
        ax[j].set_visible(False)
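# example call, as a sketch (df with these columns is built further below):
# plot_mn(df, ['price', 'kml'], n_rows=1, kind='hist')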
            
mfg = ['bmw', 'vw', 'ford', 'toyota', 'hyundi']  # 'hyundi' (sic) matches the file name in the dataset repo
dfs = []
for f in mfg:
    url = f'https://github.com/prasertcbs/basic-dataset/raw/master/q2/{f}.csv'
    print(url)
    dt = pd.read_csv(url, skipinitialspace=True)
    dt['mfg'] = f
    dfs.append(dt)
df = pd.concat(dfs)
df
df.columns
df['mileage_km'] = df['mileage']*1.60934  # miles -> km
df['kml'] = df['mpg']*0.4251437075        # mpg -> km/l (factor assumes US gallons; imperial mpg would use ~0.3540)
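# quick sanity check of the factors above: 10,000 miles ~ 16,093 km; 50 mpg ~ 21.26 km/l
print(10_000 * 1.60934, 50 * 0.4251437075)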
df
df.info() 
df = df.dropna().reset_index(drop=True).copy()
df
df.drop_duplicates(inplace=True)
df.drop(columns=['mpg','mileage'],inplace=True)
df.isna().sum()
df = convert_obj_columns_to_category(df)
df[['mileage_km', 'kml']] = df[['mileage_km', 'kml']].round(2)
df
df.info()
df.nlargest(10, 'kml').style.background_gradient(cmap='Blues',subset=["kml"])
plot_mn(df, df.columns, 3, 'boxen')
df.sort_values('kml', ascending=False).head(10).style.background_gradient(cmap='Blues',subset=["kml"])\
                        .background_gradient(cmap='Reds',subset=["engineSize"])
df = df[~df['fuelType'].isin(['Other', 'Hybrid', 'Electric'])]
df['fuelType'] = df['fuelType'].cat.remove_unused_categories()  # drop the now-empty categories
fuelType_count=df['fuelType'].value_counts()
pd.DataFrame(fuelType_count).style.background_gradient(cmap='Greens')
# ax = sns.countplot(df.fuelType, palette="ch:.1", order=['Petrol','Diesel'])
px.histogram(df, x='fuelType', color='fuelType', title='fuelType count')
df = df.reset_index(drop=True)
df.columns
cols=['model', 'year', 'price', 'transmission', 'fuelType', 'engineSize',
       'mfg', 'mileage_km', 'kml']
dcorr = df[cols].corr(numeric_only=True)  # non-numeric (category) columns are excluded
# dcorr
mask = np.zeros_like(dcorr)
# mask.shape
mask[np.triu_indices_from(mask)] = True
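# what the mask does, shown on a toy 3x3: ones mark the upper triangle
# (including the diagonal), which the heatmap below will hide
demo_mask = np.zeros((3, 3))
demo_mask[np.triu_indices_from(demo_mask)] = 1
print(demo_mask)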
fig, ax = plt.subplots(figsize=(10,8)) 
sns.heatmap(dcorr, cmap=sns.diverging_palette(10, 145, n=100),
            vmin=-1, vmax=1, center=0, linewidths=1, annot=True, mask=mask, ax=ax)
ax.set_title('Correlation\nHeatmap', fontsize=22, fontweight='bold');
feature_cols=['year','transmission', 'fuelType', 'engineSize',
       'mfg', 'mileage_km', 'kml']
target_col='price'
X=df[feature_cols]
y=df[target_col]
from sklearn.model_selection import train_test_split,cross_val_score
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)
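# sanity check of the 70/30 split
print(f'train: {X_train.shape}, test: {X_test.shape}')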
!pip install flaml
from flaml import AutoML
automl = AutoML()
settings = {
    "time_budget": 100,  # total running time in seconds
    "metric": 'r2',  # primary metrics for regression can be chosen from: ['mae','mse','r2']
    "estimator_list": ['lgbm'],  # list of ML learners; we tune lightgbm in this example
    "task": 'regression',  # task type    
    "log_file_name": 'UK_used_car_price.log',  # flaml log file
}
# the main flaml AutoML API
automl.fit(X_train=X_train, y_train=y_train, **settings)
automl.model
# compute predictions on the test set
y_pred = automl.predict(X_test)
print('Predicted labels', y_pred)
print('True labels', y_test)
# compute different metric values on the test set
from flaml.ml import sklearn_metric_loss_score
print('r2', '=', 1 - sklearn_metric_loss_score('r2', y_pred, y_test))
print('mse', '=', sklearn_metric_loss_score('mse', y_pred, y_test))
print('mae', '=', sklearn_metric_loss_score('mae', y_pred, y_test))
from flaml.data import get_output_from_log
time_history, best_valid_loss_history, valid_loss_history, config_history, train_loss_history = \
    get_output_from_log(filename=settings['log_file_name'], time_budget=settings['time_budget'])
for config in config_history:
    print(config)
plt.title('Learning Curve')
plt.xlabel('Wall Clock Time (s)')
plt.ylabel('Validation r2')
plt.scatter(time_history, 1 - np.array(valid_loss_history))
plt.step(time_history, 1 - np.array(best_valid_loss_history), where='post')
plt.show()
automl.best_iteration
print('flaml r2', '=', 1 - sklearn_metric_loss_score('r2', y_pred, y_test))
import lightgbm as lgb
print(f'lightgbm version = {lgb.__version__}')
params={'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'num_leaves': 31,
 'objective': None,
 'random_state': None,
 'reg_alpha': 5.0, 
 'reg_lambda': 0.0,
 # 'silent': True,  # removed in lightgbm >= 4.0, so omitted here
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0
}
# reg = lgb.LGBMRegressor()
lgbm = lgb.LGBMRegressor(**params)
fit_params = {
    'eval_set': [(X_test, y_test)],
    # lightgbm >= 4.0 handles early stopping and eval logging via callbacks
    'callbacks': [lgb.early_stopping(100), lgb.log_evaluation(10)],
}
lgbm.fit(X_train, y_train, **fit_params)  # early stopping on the eval set; enables lgb.plot_metric
y_pred = lgbm.predict(X_test)
from flaml.ml import sklearn_metric_loss_score
print('default lgbm r2', '=', 1 - sklearn_metric_loss_score('r2', y_pred, y_test))
lgb.plot_metric(lgbm)  # after roughly 20 training rounds, the l2 metric starts to level off
lgbm.score(X_train, y_train)
lgbm.predict(X_test[:5])
from sklearn.model_selection import ShuffleSplit
cv = ShuffleSplit(n_splits=5, test_size=0.2)
scores = cross_val_score(lgbm, X, y, cv=cv)
print(scores)
print(f'mean score = {scores.mean():.4f}, sd = {scores.std():.4f}')
!pip install -U graphviz  # the Python binding also requires the Graphviz system package
show_info=['split_gain', 'internal_value', 'data_percentage', 'leaf_count']
orientation='vertical'
max_num_trees = lgbm.booster_.num_trees()  # public accessor for the underlying Booster
w_tree_index=widgets.BoundedIntText(
    value=0,
    min=0,
    max=max_num_trees-1,
    step=1,
    continuous_update=True,
)
w_max_depth=widgets.BoundedIntText(
    value=3,
    min=2,
    max=10,
    step=1,
    continuous_update=True,
)
w_n_estimators=widgets.BoundedIntText(
    value=3,
    min=1,
    max=100,
    step=1,
    continuous_update=True,
)
def plot_tree(tree_index=w_tree_index, orientation=['vertical', 'horizontal'], max_depth=w_max_depth, n_estimators=w_n_estimators, save_tree_img=[False, True]):
    # note: max_depth and n_estimators only drive the widget ranges here; the fitted model is not retrained
    # show one sample row and its prediction for context
    case_index = 1
    print(X_test.iloc[[case_index]])
    print(f'predicted value = {lgbm.predict(X_test.iloc[[case_index]])}')
    w_tree_index.max = w_n_estimators.value - 1  # keep tree_index within the chosen range
    g=lgb.create_tree_digraph(lgbm, orientation=orientation, tree_index=tree_index, show_info=show_info, precision=4)
    if save_tree_img:
        g.format='png'
        g.render(f'tree{tree_index}', view=False, cleanup=True)
    return g
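# a sketch of how the widgets above are presumably wired up: interact builds
# controls from the defaults (widgets and option lists) and re-runs plot_tree on change
interact(plot_tree);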
!pip install shap
import shap
print(f'shap version {shap.__version__}')
# load JS visualization code to notebook
shap.initjs()
def case_detail(case_data):
    '''
    format obj returned from shap.force_plot()
    '''
    de=pd.DataFrame(case_data.data['features'])
    fcols=[]
    for i in case_data.data['features'].keys():
        fcols.append(case_data.data['featureNames'][i])
    de.columns=fcols
    return de
def individual_case_plot(explainer, X, case_index, verbose=False):
    """
    >>> individual_case_plot(explainer, X_train, 1)
    """
    shap_values = explainer.shap_values(X.iloc[[case_index]])
    g=shap.force_plot(explainer.expected_value, shap_values=shap_values, features=X.iloc[case_index, :])
    if verbose:
        from pprint import pprint  # local import; only needed for debugging output
        pprint(g.__dict__)
    return g
# explain the model's predictions using SHAP
explainer = shap.TreeExplainer(lgbm)
shap_values = explainer.shap_values(X)
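# example: force plot for a single observation (row 0, chosen arbitrarily),
# using the individual_case_plot helper defined above
individual_case_plot(explainer, X, 0)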
X
shap_values[:3]
dshap=pd.DataFrame(shap_values, columns=feature_cols)
dshap
# mean(abs(SHAP value)): average impact on model output magnitude
feature_imp = np.abs(dshap).mean().sort_values(ascending=False)
pd.DataFrame(feature_imp).style.background_gradient(cmap='Blues')
shap.summary_plot(shap_values, X, plot_type="bar")  # pass the raw shap_values array, not the DataFrame
# summarize the effects of all the features
shap.summary_plot(shap_values, X)