# Core imports for data wrangling, visualization, and interactive widgets.
import sys
import math
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
# import plotly
import plotly.express as px
# NOTE(review): shell-escape install; assumes this runs inside a Jupyter/IPython cell.
!pip install ipywidgets
import ipywidgets as widgets
from ipywidgets import interact
# Use seaborn's whitegrid theme for every subsequent plot.
sns.set_style('whitegrid')
# IPython magics: render figures inline at retina (2x) resolution.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
Requirement already satisfied: ipywidgets in /root/venv/lib/python3.7/site-packages (7.6.5)
Requirement already satisfied: ipython>=4.0.0 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from ipywidgets) (7.28.0)
Requirement already satisfied: jupyterlab-widgets>=1.0.0 in /root/venv/lib/python3.7/site-packages (from ipywidgets) (1.0.2)
Requirement already satisfied: nbformat>=4.2.0 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from ipywidgets) (5.1.3)
Requirement already satisfied: traitlets>=4.3.1 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from ipywidgets) (4.3.3)
Requirement already satisfied: ipython-genutils~=0.2.0 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from ipywidgets) (0.2.0)
Requirement already satisfied: widgetsnbextension~=3.5.0 in /root/venv/lib/python3.7/site-packages (from ipywidgets) (3.5.1)
Requirement already satisfied: ipykernel>=4.5.1 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from ipywidgets) (5.5.5)
Requirement already satisfied: jupyter-client in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from ipykernel>=4.5.1->ipywidgets) (6.1.12)
Requirement already satisfied: tornado>=4.2 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from ipykernel>=4.5.1->ipywidgets) (6.1)
Requirement already satisfied: decorator in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from ipython>=4.0.0->ipywidgets) (5.1.0)
Requirement already satisfied: prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from ipython>=4.0.0->ipywidgets) (3.0.20)
Requirement already satisfied: jedi>=0.16 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from ipython>=4.0.0->ipywidgets) (0.17.2)
Requirement already satisfied: pygments in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from ipython>=4.0.0->ipywidgets) (2.10.0)
Requirement already satisfied: pexpect>4.3 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from ipython>=4.0.0->ipywidgets) (4.8.0)
Requirement already satisfied: setuptools>=18.5 in /root/venv/lib/python3.7/site-packages (from ipython>=4.0.0->ipywidgets) (58.1.0)
Requirement already satisfied: pickleshare in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from ipython>=4.0.0->ipywidgets) (0.7.5)
Requirement already satisfied: backcall in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from ipython>=4.0.0->ipywidgets) (0.2.0)
Requirement already satisfied: matplotlib-inline in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from ipython>=4.0.0->ipywidgets) (0.1.3)
Requirement already satisfied: parso<0.8.0,>=0.7.0 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from jedi>=0.16->ipython>=4.0.0->ipywidgets) (0.7.1)
Requirement already satisfied: jsonschema!=2.5.0,>=2.4 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from nbformat>=4.2.0->ipywidgets) (3.2.0)
Requirement already satisfied: jupyter-core in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from nbformat>=4.2.0->ipywidgets) (4.7.1)
Requirement already satisfied: pyrsistent>=0.14.0 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.2.0->ipywidgets) (0.18.0)
Requirement already satisfied: importlib-metadata in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.2.0->ipywidgets) (4.8.1)
Requirement already satisfied: attrs>=17.4.0 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.2.0->ipywidgets) (21.2.0)
Requirement already satisfied: six>=1.11.0 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.2.0->ipywidgets) (1.16.0)
Requirement already satisfied: ptyprocess>=0.5 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pexpect>4.3->ipython>=4.0.0->ipywidgets) (0.7.0)
Requirement already satisfied: wcwidth in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0->ipython>=4.0.0->ipywidgets) (0.2.5)
Requirement already satisfied: notebook>=4.4.1 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from widgetsnbextension~=3.5.0->ipywidgets) (6.3.0)
Requirement already satisfied: prometheus-client in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (0.11.0)
Requirement already satisfied: jinja2 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (3.0.2)
Requirement already satisfied: terminado>=0.8.3 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (0.12.1)
Requirement already satisfied: argon2-cffi in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (21.1.0)
Requirement already satisfied: Send2Trash>=1.5.0 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (1.8.0)
Requirement already satisfied: pyzmq>=17 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (22.3.0)
Requirement already satisfied: nbconvert==6.0.7 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (6.0.7)
Requirement already satisfied: python-dateutil>=2.1 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from jupyter-client->ipykernel>=4.5.1->ipywidgets) (2.8.2)
Requirement already satisfied: entrypoints>=0.2.2 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from nbconvert==6.0.7->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (0.3)
Requirement already satisfied: defusedxml in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from nbconvert==6.0.7->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (0.7.1)
Requirement already satisfied: mistune<2,>=0.8.1 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from nbconvert==6.0.7->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (0.8.4)
Requirement already satisfied: nbclient<0.6.0,>=0.5.0 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from nbconvert==6.0.7->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (0.5.4)
Requirement already satisfied: bleach in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from nbconvert==6.0.7->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (4.1.0)
Requirement already satisfied: testpath in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from nbconvert==6.0.7->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (0.5.0)
Requirement already satisfied: pandocfilters>=1.4.1 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from nbconvert==6.0.7->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (1.5.0)
Requirement already satisfied: jupyterlab-pygments in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from nbconvert==6.0.7->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (0.1.2)
Requirement already satisfied: MarkupSafe>=2.0 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from jinja2->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (2.0.1)
Requirement already satisfied: nest-asyncio in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from nbclient<0.6.0,>=0.5.0->nbconvert==6.0.7->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (1.5.1)
Requirement already satisfied: cffi>=1.0.0 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from argon2-cffi->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (1.14.6)
Requirement already satisfied: pycparser in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from cffi>=1.0.0->argon2-cffi->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (2.20)
Requirement already satisfied: webencodings in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from bleach->nbconvert==6.0.7->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (0.5.1)
Requirement already satisfied: packaging in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from bleach->nbconvert==6.0.7->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (21.0)
Requirement already satisfied: zipp>=0.5 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from importlib-metadata->jsonschema!=2.5.0,>=2.4->nbformat>=4.2.0->ipywidgets) (3.6.0)
Requirement already satisfied: typing-extensions>=3.6.4 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from importlib-metadata->jsonschema!=2.5.0,>=2.4->nbformat>=4.2.0->ipywidgets) (3.10.0.2)
Requirement already satisfied: pyparsing>=2.0.2 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from packaging->bleach->nbconvert==6.0.7->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets) (2.4.7)
WARNING: You are using pip version 21.2.4; however, version 21.3.1 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
# Record the runtime environment (Python + key library versions) and the
# current timestamp so the notebook output documents when it was executed.
print(f'Python version: {sys.version}')
print(f'pandas version: {pd.__version__}')
print(f'numpy version: {np.__version__}')
print(f'seaborn version: {sns.__version__}')
pd.Timestamp.now()
# pd.Timestamp.now().strftime('%Y-%m-%d')
Python version: 3.7.12 (default, Oct 12 2021, 03:36:26)
[GCC 8.3.0]
pandas version: 1.2.5
numpy version: 1.19.5
seaborn version: 0.11.2
import warnings
# Silence library deprecation noise (e.g. seaborn's distplot FutureWarning).
warnings.filterwarnings('ignore')
from datetime import datetime
import socket  # stdlib module -- no pip install required

# Log when and where this notebook was last executed.
print(f'last run: {datetime.now()}')
try:
    # Fixed typo: "addtress" -> "address" in the printed message.
    print(f'ip address: {socket.gethostbyname(socket.gethostname())} ({socket.gethostname()})')
except OSError:
    # Hostname may not resolve in sandboxed environments; best-effort only.
    pass
last run: 2021-10-22 16:49:14.833760
ip address: 172.3.49.191 (p-d76f3d72-4f5e-4ae5-9e32-1e304bde2c7d)
def df_unique_value(df):
    """Print the unique values of every object/categorical column in *df*."""
    for col in df.columns:
        dtype = df[col].dtype
        if dtype == 'object' or dtype.name == 'category':
            print(f'{col:10}\n{df[col].unique()}')
            print('-' * 65)
def convert_cols_to_category(df, cols: list):
    """Cast each column named in *cols* to the pandas 'category' dtype.

    Mutates *df* in place and returns it for chaining.
    """
    for name in df[cols].columns:
        df[name] = df[name].astype('category')
    return df
def convert_obj_columns_to_category(df):
    """Cast every object (or already-categorical) column of *df* to 'category'.

    Mutates *df* in place and returns it.
    """
    targets = [c for c in df.columns
               if df[c].dtype == 'object' or df[c].dtype.name == 'category']
    for c in targets:
        df[c] = df[c].astype('category')
    return df
def print_category_columns(df):
    """For each categorical column, print its (code, category-label) pairs."""
    for col in df.columns:
        if df[col].dtype.name != 'category':
            continue
        # print(f'{col}: {df[col].cat.categories}')
        # print(pd.Series(df[col].cat.categories))
        pairs = list(enumerate(df[col].cat.categories))
        print(f'{col:15}: {pairs}')
        print('-' * 60)
def plot_mn(df, cols, n_rows: int = 1, kind: str = 'boxplot', color='salmon'):
    """
    Plot each column of *cols* in an n_rows x n_cols grid.

    Categorical columns are drawn as countplots; numeric columns use the
    style chosen by *kind*: 'boxplot', 'boxen', 'violin', or 'hist'.

    >>> plot_mn(df, ['Calories', 'Fat'], 2, 'hist')
    """
    n = len(cols)
    n_cols = math.ceil(n / n_rows)
    # squeeze=False guarantees a 2-D array of Axes even for a 1x1 grid, so
    # .ravel() is always valid (the original crashed on a single subplot,
    # where plt.subplots returns a bare Axes with no .ravel()).
    fig, axes = plt.subplots(n_rows, n_cols,
                             figsize=(n_cols * 3, n_rows * 3.5),
                             squeeze=False)
    axes = axes.ravel()
    fig.tight_layout()
    kind = kind.lower()  # normalize once instead of on every branch
    for i, c in enumerate(cols):
        if df[c].dtype.name == 'category':
            sns.countplot(data=df, x=c, ax=axes[i])
        elif kind == 'boxplot':
            sns.boxplot(data=df[[c]], ax=axes[i], color=color)
        elif kind == 'boxen':
            sns.boxenplot(data=df[[c]], ax=axes[i], color=color)
        elif kind == 'violin':
            sns.violinplot(data=df[[c]], ax=axes[i], color=color)
        elif kind == 'hist':
            # NOTE: distplot is deprecated in seaborn 0.11; kept for
            # visual parity (warnings are filtered earlier in the notebook).
            sns.distplot(df[c], hist=True, kde=False, ax=axes[i], color=color)
    # Hide unused trailing axes when n_rows * n_cols > len(cols).
    for j in range(n, len(axes)):
        axes[j].set_visible(False)
# Download each manufacturer's CSV, tag its rows with the maker name,
# and stack everything into a single DataFrame.
mfg = ['bmw', 'vw', 'ford', 'toyota', 'hyundi']
dfs = []
for brand in mfg:
    url = f'https://github.com/prasertcbs/basic-dataset/raw/master/q2/{brand}.csv'
    print(url)
    part = pd.read_csv(url, skipinitialspace=True)
    part['mfg'] = brand
    dfs.append(part)
df = pd.concat(dfs)
df
https://github.com/prasertcbs/basic-dataset/raw/master/q2/bmw.csv
https://github.com/prasertcbs/basic-dataset/raw/master/q2/vw.csv
https://github.com/prasertcbs/basic-dataset/raw/master/q2/ford.csv
https://github.com/prasertcbs/basic-dataset/raw/master/q2/toyota.csv
https://github.com/prasertcbs/basic-dataset/raw/master/q2/hyundi.csv
# Derive metric-unit columns: mileage (miles) -> kilometres,
# mpg (miles per imperial gallon) -> kilometres per litre.
df.columns
df['mileage_km'] = df['mileage']*1.60934
df['kml'] = df['mpg']*0.4251437075
df
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 7500 entries, 0 to 1499
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 model 7380 non-null object
1 year 7380 non-null float64
2 price 7500 non-null int64
3 transmission 7440 non-null object
4 mileage 7320 non-null float64
5 fuelType 7320 non-null object
6 mpg 7440 non-null float64
7 engineSize 7320 non-null float64
8 mfg 7500 non-null object
9 mileage_km 7320 non-null float64
10 kml 7440 non-null float64
dtypes: float64(6), int64(1), object(4)
memory usage: 703.1+ KB
# Clean-up: drop missing rows and duplicates, remove the now-redundant
# imperial-unit columns, and shrink text columns to categories.
df = df.dropna().reset_index(drop=True).copy()
df
df.drop_duplicates(inplace=True)
df.drop(columns=['mpg', 'mileage'], inplace=True)
df.isna().sum()
df = convert_obj_columns_to_category(df)
# DataFrame.round(2) is the idiomatic, vectorized form of the original
# apply(lambda x: round(number=x, ndigits=2)).
df[['mileage_km', 'kml']] = df[['mileage_km', 'kml']].round(2)
df
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 6606 entries, 0 to 6632
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 model 6606 non-null category
1 year 6606 non-null float64
2 price 6606 non-null int64
3 transmission 6606 non-null category
4 fuelType 6606 non-null category
5 engineSize 6606 non-null float64
6 mfg 6606 non-null category
7 mileage_km 6606 non-null float64
8 kml 6606 non-null float64
dtypes: category(4), float64(4), int64(1)
memory usage: 338.8 KB
# Top-10 most fuel-efficient cars, distribution plots for every column,
# and the same top-10 with engineSize highlighted for contrast.
df.nlargest(10, 'kml').style.background_gradient(cmap='Blues',subset=["kml"])
plot_mn(df, df.columns, 3, 'boxen')
df.sort_values('kml', ascending=False).head(10).style.background_gradient(cmap='Blues',subset=["kml"])\
.background_gradient(cmap='Reds',subset=["engineSize"])
# Keep only Petrol/Diesel rows: the rare fuel types would skew the model.
# One isin() drop replaces the original three separate drop calls.
df.drop(df[df['fuelType'].isin(['Other', 'Hybrid', 'Electric'])].index,
        inplace=True)
fuelType_count = df['fuelType'].value_counts()
pd.DataFrame(fuelType_count).style.background_gradient(cmap='Greens')
# ax = sns.countplot(df.fuelType, palette="ch:.1", order=['Petrol','Diesel'])
# import plotly.io as pio
px.histogram(df.fuelType, x='fuelType', color='fuelType', title='Count_Plot_fuelType')
# pio.write_json(fig, 'countplot.plotly')
# fig_styled = pio.read_json('countplot.plotly')
# fig_styled
# Bug fix: drop must be the bool True (the original passed the string 'True',
# which only worked by being truthy) and the result must be assigned back --
# the original call discarded its return value, so the index was never reset.
df = df.reset_index(drop=True)
df.columns
# Columns considered for the correlation matrix (non-numeric columns are
# silently ignored by DataFrame.corr()).
cols=['model', 'year', 'price', 'transmission', 'fuelType', 'engineSize',
'mfg', 'mileage_km', 'kml']
dcorr = df[cols].corr()
# dcorr
# Mask the upper triangle so each correlation is shown only once.
mask = np.zeros_like(dcorr)
# mask.shape
mask[np.triu_indices_from(mask)] = True
fig, ax = plt.subplots(figsize=(10,8))
sns.heatmap(dcorr, cmap=sns.diverging_palette(10, 145, n=100),
vmin=-1, vmax=1, center=0, linewidths=1, annot=True, mask=mask, ax=ax).set_title("Correlation\nHeatmap", fontsize=22,fontweight="bold");
# Feature/target split for the price-prediction model.
feature_cols=['year','transmission', 'fuelType', 'engineSize',
'mfg', 'mileage_km', 'kml']
target_col='price'
X=df[feature_cols]
y=df[target_col]
from sklearn.model_selection import train_test_split,cross_val_score
# Hold out 30% for evaluation; fixed random_state for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)
!pip install flaml
from flaml import AutoML
automl = AutoML()
Requirement already satisfied: flaml in /root/venv/lib/python3.7/site-packages (0.6.9)
Requirement already satisfied: scipy>=1.4.1 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from flaml) (1.7.1)
Requirement already satisfied: lightgbm>=2.3.1 in /root/venv/lib/python3.7/site-packages (from flaml) (3.3.0)
Requirement already satisfied: scikit-learn>=0.24 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from flaml) (1.0)
Requirement already satisfied: NumPy>=1.16.2 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from flaml) (1.19.5)
Requirement already satisfied: xgboost<=1.3.3,>=0.90 in /root/venv/lib/python3.7/site-packages (from flaml) (1.3.3)
Requirement already satisfied: pandas>=1.1.4 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from flaml) (1.2.5)
Requirement already satisfied: wheel in /root/venv/lib/python3.7/site-packages (from lightgbm>=2.3.1->flaml) (0.37.0)
Requirement already satisfied: pytz>=2017.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pandas>=1.1.4->flaml) (2021.3)
Requirement already satisfied: python-dateutil>=2.7.3 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pandas>=1.1.4->flaml) (2.8.2)
Requirement already satisfied: six>=1.5 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from python-dateutil>=2.7.3->pandas>=1.1.4->flaml) (1.16.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from scikit-learn>=0.24->flaml) (3.0.0)
Requirement already satisfied: joblib>=0.11 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from scikit-learn>=0.24->flaml) (1.1.0)
WARNING: You are using pip version 21.2.4; however, version 21.3.1 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
# FLAML AutoML configuration: tune LightGBM for up to 100 s, optimizing R^2.
settings = {
"time_budget": 100, # total running time in seconds
"metric": 'r2', # primary metrics for regression can be chosen from: ['mae','mse','r2']
"estimator_list": ['lgbm'], # list of ML learners; we tune lightgbm in this example
"task": 'regression', # task type
"log_file_name": 'UK_used_car_price.log', # flaml log file
}
'''The main flaml automl API'''
automl.fit(X_train=X_train, y_train=y_train, **settings)
[flaml.automl: 10-22 16:49:35] {1463} INFO - Data split method: uniform
[flaml.automl: 10-22 16:49:35] {1467} INFO - Evaluation method: cv
[flaml.automl: 10-22 16:49:35] {1515} INFO - Minimizing error metric: 1-r2
[flaml.automl: 10-22 16:49:35] {1552} INFO - List of ML learners in AutoML Run: ['lgbm']
[flaml.automl: 10-22 16:49:35] {1793} INFO - iteration 0, current learner lgbm
[flaml.automl: 10-22 16:49:36] {1911} INFO - Estimated sufficient time budget=2025s. Estimated necessary time budget=2s.
[flaml.automl: 10-22 16:49:36] {1987} INFO - at 0.3s, estimator lgbm's best error=0.6505, best estimator lgbm's best error=0.6505
[flaml.automl: 10-22 16:49:36] {1793} INFO - iteration 1, current learner lgbm
[flaml.automl: 10-22 16:49:36] {1987} INFO - at 0.5s, estimator lgbm's best error=0.6505, best estimator lgbm's best error=0.6505
[flaml.automl: 10-22 16:49:36] {1793} INFO - iteration 2, current learner lgbm
[flaml.automl: 10-22 16:49:36] {1987} INFO - at 0.7s, estimator lgbm's best error=0.3718, best estimator lgbm's best error=0.3718
[flaml.automl: 10-22 16:49:36] {1793} INFO - iteration 3, current learner lgbm
[flaml.automl: 10-22 16:49:36] {1987} INFO - at 1.0s, estimator lgbm's best error=0.1586, best estimator lgbm's best error=0.1586
[flaml.automl: 10-22 16:49:36] {1793} INFO - iteration 4, current learner lgbm
[flaml.automl: 10-22 16:49:37] {1987} INFO - at 1.2s, estimator lgbm's best error=0.1586, best estimator lgbm's best error=0.1586
[flaml.automl: 10-22 16:49:37] {1793} INFO - iteration 5, current learner lgbm
[flaml.automl: 10-22 16:49:37] {1987} INFO - at 1.4s, estimator lgbm's best error=0.1483, best estimator lgbm's best error=0.1483
[flaml.automl: 10-22 16:49:37] {1793} INFO - iteration 6, current learner lgbm
[flaml.automl: 10-22 16:49:37] {1987} INFO - at 1.7s, estimator lgbm's best error=0.1483, best estimator lgbm's best error=0.1483
[flaml.automl: 10-22 16:49:37] {1793} INFO - iteration 7, current learner lgbm
[flaml.automl: 10-22 16:49:37] {1987} INFO - at 1.9s, estimator lgbm's best error=0.1483, best estimator lgbm's best error=0.1483
[flaml.automl: 10-22 16:49:37] {1793} INFO - iteration 8, current learner lgbm
[flaml.automl: 10-22 16:49:38] {1987} INFO - at 2.2s, estimator lgbm's best error=0.1258, best estimator lgbm's best error=0.1258
[flaml.automl: 10-22 16:49:38] {1793} INFO - iteration 9, current learner lgbm
[flaml.automl: 10-22 16:49:38] {1987} INFO - at 2.4s, estimator lgbm's best error=0.1258, best estimator lgbm's best error=0.1258
[flaml.automl: 10-22 16:49:38] {1793} INFO - iteration 10, current learner lgbm
[flaml.automl: 10-22 16:49:38] {1987} INFO - at 2.9s, estimator lgbm's best error=0.0875, best estimator lgbm's best error=0.0875
[flaml.automl: 10-22 16:49:38] {1793} INFO - iteration 11, current learner lgbm
[flaml.automl: 10-22 16:49:39] {1987} INFO - at 3.4s, estimator lgbm's best error=0.0875, best estimator lgbm's best error=0.0875
[flaml.automl: 10-22 16:49:39] {1793} INFO - iteration 12, current learner lgbm
[flaml.automl: 10-22 16:49:39] {1987} INFO - at 3.8s, estimator lgbm's best error=0.0875, best estimator lgbm's best error=0.0875
[flaml.automl: 10-22 16:49:39] {1793} INFO - iteration 13, current learner lgbm
[flaml.automl: 10-22 16:49:40] {1987} INFO - at 4.3s, estimator lgbm's best error=0.0875, best estimator lgbm's best error=0.0875
[flaml.automl: 10-22 16:49:40] {1793} INFO - iteration 14, current learner lgbm
[flaml.automl: 10-22 16:49:40] {1987} INFO - at 4.9s, estimator lgbm's best error=0.0875, best estimator lgbm's best error=0.0875
[flaml.automl: 10-22 16:49:40] {1793} INFO - iteration 15, current learner lgbm
[flaml.automl: 10-22 16:49:41] {1987} INFO - at 5.5s, estimator lgbm's best error=0.0875, best estimator lgbm's best error=0.0875
[flaml.automl: 10-22 16:49:41] {1793} INFO - iteration 16, current learner lgbm
[flaml.automl: 10-22 16:49:41] {1987} INFO - at 5.9s, estimator lgbm's best error=0.0875, best estimator lgbm's best error=0.0875
[flaml.automl: 10-22 16:49:41] {1793} INFO - iteration 17, current learner lgbm
[flaml.automl: 10-22 16:49:42] {1987} INFO - at 6.6s, estimator lgbm's best error=0.0841, best estimator lgbm's best error=0.0841
[flaml.automl: 10-22 16:49:42] {1793} INFO - iteration 18, current learner lgbm
[flaml.automl: 10-22 16:49:42] {1987} INFO - at 7.0s, estimator lgbm's best error=0.0841, best estimator lgbm's best error=0.0841
[flaml.automl: 10-22 16:49:42] {1793} INFO - iteration 19, current learner lgbm
[flaml.automl: 10-22 16:49:43] {1987} INFO - at 7.5s, estimator lgbm's best error=0.0841, best estimator lgbm's best error=0.0841
[flaml.automl: 10-22 16:49:43] {1793} INFO - iteration 20, current learner lgbm
[flaml.automl: 10-22 16:49:44] {1987} INFO - at 8.8s, estimator lgbm's best error=0.0841, best estimator lgbm's best error=0.0841
[flaml.automl: 10-22 16:49:44] {1793} INFO - iteration 21, current learner lgbm
[flaml.automl: 10-22 16:49:45] {1987} INFO - at 9.5s, estimator lgbm's best error=0.0838, best estimator lgbm's best error=0.0838
[flaml.automl: 10-22 16:49:45] {1793} INFO - iteration 22, current learner lgbm
[flaml.automl: 10-22 16:49:46] {1987} INFO - at 10.2s, estimator lgbm's best error=0.0817, best estimator lgbm's best error=0.0817
[flaml.automl: 10-22 16:49:46] {1793} INFO - iteration 23, current learner lgbm
[flaml.automl: 10-22 16:49:46] {1987} INFO - at 10.9s, estimator lgbm's best error=0.0817, best estimator lgbm's best error=0.0817
[flaml.automl: 10-22 16:49:46] {1793} INFO - iteration 24, current learner lgbm
[flaml.automl: 10-22 16:49:47] {1987} INFO - at 11.7s, estimator lgbm's best error=0.0817, best estimator lgbm's best error=0.0817
[flaml.automl: 10-22 16:49:47] {1793} INFO - iteration 25, current learner lgbm
[flaml.automl: 10-22 16:49:48] {1987} INFO - at 13.1s, estimator lgbm's best error=0.0817, best estimator lgbm's best error=0.0817
[flaml.automl: 10-22 16:49:48] {1793} INFO - iteration 26, current learner lgbm
[flaml.automl: 10-22 16:49:49] {1987} INFO - at 13.5s, estimator lgbm's best error=0.0817, best estimator lgbm's best error=0.0817
[flaml.automl: 10-22 16:49:49] {1793} INFO - iteration 27, current learner lgbm
[flaml.automl: 10-22 16:49:50] {1987} INFO - at 14.6s, estimator lgbm's best error=0.0817, best estimator lgbm's best error=0.0817
[flaml.automl: 10-22 16:49:50] {1793} INFO - iteration 28, current learner lgbm
[flaml.automl: 10-22 16:49:50] {1987} INFO - at 15.1s, estimator lgbm's best error=0.0817, best estimator lgbm's best error=0.0817
[flaml.automl: 10-22 16:49:50] {1793} INFO - iteration 29, current learner lgbm
[flaml.automl: 10-22 16:49:51] {1987} INFO - at 15.4s, estimator lgbm's best error=0.0817, best estimator lgbm's best error=0.0817
[flaml.automl: 10-22 16:49:51] {1793} INFO - iteration 30, current learner lgbm
[flaml.automl: 10-22 16:49:52] {1987} INFO - at 17.0s, estimator lgbm's best error=0.0817, best estimator lgbm's best error=0.0817
[flaml.automl: 10-22 16:49:52] {1793} INFO - iteration 31, current learner lgbm
[flaml.automl: 10-22 16:49:53] {1987} INFO - at 17.6s, estimator lgbm's best error=0.0817, best estimator lgbm's best error=0.0817
[flaml.automl: 10-22 16:49:53] {1793} INFO - iteration 32, current learner lgbm
[flaml.automl: 10-22 16:49:54] {1987} INFO - at 18.5s, estimator lgbm's best error=0.0817, best estimator lgbm's best error=0.0817
[flaml.automl: 10-22 16:49:54] {1793} INFO - iteration 33, current learner lgbm
[flaml.automl: 10-22 16:49:54] {1987} INFO - at 19.1s, estimator lgbm's best error=0.0817, best estimator lgbm's best error=0.0817
[flaml.automl: 10-22 16:49:54] {1793} INFO - iteration 34, current learner lgbm
[flaml.automl: 10-22 16:49:55] {1987} INFO - at 19.7s, estimator lgbm's best error=0.0817, best estimator lgbm's best error=0.0817
[flaml.automl: 10-22 16:49:55] {1793} INFO - iteration 35, current learner lgbm
[flaml.automl: 10-22 16:49:57] {1987} INFO - at 21.2s, estimator lgbm's best error=0.0810, best estimator lgbm's best error=0.0810
[flaml.automl: 10-22 16:49:57] {1793} INFO - iteration 36, current learner lgbm
[flaml.automl: 10-22 16:49:57] {1987} INFO - at 21.8s, estimator lgbm's best error=0.0810, best estimator lgbm's best error=0.0810
[flaml.automl: 10-22 16:49:57] {1793} INFO - iteration 37, current learner lgbm
[flaml.automl: 10-22 16:50:00] {1987} INFO - at 24.9s, estimator lgbm's best error=0.0810, best estimator lgbm's best error=0.0810
[flaml.automl: 10-22 16:50:00] {1793} INFO - iteration 38, current learner lgbm
[flaml.automl: 10-22 16:50:02] {1987} INFO - at 26.4s, estimator lgbm's best error=0.0809, best estimator lgbm's best error=0.0809
[flaml.automl: 10-22 16:50:02] {1793} INFO - iteration 39, current learner lgbm
[flaml.automl: 10-22 16:50:10] {1987} INFO - at 34.9s, estimator lgbm's best error=0.0809, best estimator lgbm's best error=0.0809
[flaml.automl: 10-22 16:50:10] {1793} INFO - iteration 40, current learner lgbm
[flaml.automl: 10-22 16:50:11] {1987} INFO - at 35.4s, estimator lgbm's best error=0.0809, best estimator lgbm's best error=0.0809
[flaml.automl: 10-22 16:50:11] {1793} INFO - iteration 41, current learner lgbm
[flaml.automl: 10-22 16:50:11] {1987} INFO - at 35.9s, estimator lgbm's best error=0.0809, best estimator lgbm's best error=0.0809
[flaml.automl: 10-22 16:50:11] {1793} INFO - iteration 42, current learner lgbm
[flaml.automl: 10-22 16:50:19] {1987} INFO - at 43.6s, estimator lgbm's best error=0.0809, best estimator lgbm's best error=0.0809
[flaml.automl: 10-22 16:50:19] {1793} INFO - iteration 43, current learner lgbm
[flaml.automl: 10-22 16:50:23] {1987} INFO - at 47.6s, estimator lgbm's best error=0.0809, best estimator lgbm's best error=0.0809
[flaml.automl: 10-22 16:50:23] {1793} INFO - iteration 44, current learner lgbm
[flaml.automl: 10-22 16:50:24] {1987} INFO - at 48.3s, estimator lgbm's best error=0.0800, best estimator lgbm's best error=0.0800
[flaml.automl: 10-22 16:50:24] {1793} INFO - iteration 45, current learner lgbm
[flaml.automl: 10-22 16:50:24] {1987} INFO - at 48.7s, estimator lgbm's best error=0.0800, best estimator lgbm's best error=0.0800
[flaml.automl: 10-22 16:50:24] {1793} INFO - iteration 46, current learner lgbm
[flaml.automl: 10-22 16:50:27] {1987} INFO - at 51.9s, estimator lgbm's best error=0.0800, best estimator lgbm's best error=0.0800
[flaml.automl: 10-22 16:50:27] {1793} INFO - iteration 47, current learner lgbm
[flaml.automl: 10-22 16:50:28] {1987} INFO - at 52.3s, estimator lgbm's best error=0.0800, best estimator lgbm's best error=0.0800
[flaml.automl: 10-22 16:50:28] {1793} INFO - iteration 48, current learner lgbm
[flaml.automl: 10-22 16:50:31] {1987} INFO - at 55.4s, estimator lgbm's best error=0.0800, best estimator lgbm's best error=0.0800
[flaml.automl: 10-22 16:50:31] {1793} INFO - iteration 49, current learner lgbm
[flaml.automl: 10-22 16:50:31] {1987} INFO - at 55.7s, estimator lgbm's best error=0.0800, best estimator lgbm's best error=0.0800
[flaml.automl: 10-22 16:50:31] {1793} INFO - iteration 50, current learner lgbm
[flaml.automl: 10-22 16:50:35] {1987} INFO - at 60.1s, estimator lgbm's best error=0.0800, best estimator lgbm's best error=0.0800
[flaml.automl: 10-22 16:50:35] {1793} INFO - iteration 51, current learner lgbm
[flaml.automl: 10-22 16:50:38] {1987} INFO - at 62.4s, estimator lgbm's best error=0.0776, best estimator lgbm's best error=0.0776
[flaml.automl: 10-22 16:50:38] {1793} INFO - iteration 52, current learner lgbm
[flaml.automl: 10-22 16:50:46] {1987} INFO - at 70.2s, estimator lgbm's best error=0.0776, best estimator lgbm's best error=0.0776
[flaml.automl: 10-22 16:50:46] {1793} INFO - iteration 53, current learner lgbm
[flaml.automl: 10-22 16:50:46] {1987} INFO - at 70.9s, estimator lgbm's best error=0.0776, best estimator lgbm's best error=0.0776
[flaml.automl: 10-22 16:50:46] {1793} INFO - iteration 54, current learner lgbm
[flaml.automl: 10-22 16:50:48] {1987} INFO - at 73.0s, estimator lgbm's best error=0.0776, best estimator lgbm's best error=0.0776
[flaml.automl: 10-22 16:50:48] {1793} INFO - iteration 55, current learner lgbm
[flaml.automl: 10-22 16:50:51] {1987} INFO - at 76.1s, estimator lgbm's best error=0.0776, best estimator lgbm's best error=0.0776
[flaml.automl: 10-22 16:50:51] {1793} INFO - iteration 56, current learner lgbm
[flaml.automl: 10-22 16:50:54] {1987} INFO - at 78.6s, estimator lgbm's best error=0.0776, best estimator lgbm's best error=0.0776
[flaml.automl: 10-22 16:50:54] {1793} INFO - iteration 57, current learner lgbm
[flaml.automl: 10-22 16:50:56] {1987} INFO - at 80.9s, estimator lgbm's best error=0.0776, best estimator lgbm's best error=0.0776
[flaml.automl: 10-22 16:50:56] {1793} INFO - iteration 58, current learner lgbm
[flaml.automl: 10-22 16:50:57] {1987} INFO - at 82.0s, estimator lgbm's best error=0.0776, best estimator lgbm's best error=0.0776
[flaml.automl: 10-22 16:50:57] {1793} INFO - iteration 59, current learner lgbm
[flaml.automl: 10-22 16:51:01] {1987} INFO - at 85.9s, estimator lgbm's best error=0.0776, best estimator lgbm's best error=0.0776
[flaml.automl: 10-22 16:51:01] {1793} INFO - iteration 60, current learner lgbm
[flaml.automl: 10-22 16:51:10] {1987} INFO - at 94.3s, estimator lgbm's best error=0.0776, best estimator lgbm's best error=0.0776
[flaml.automl: 10-22 16:51:10] {1793} INFO - iteration 61, current learner lgbm
[flaml.automl: 10-22 16:51:10] {1987} INFO - at 94.7s, estimator lgbm's best error=0.0776, best estimator lgbm's best error=0.0776
[flaml.automl: 10-22 16:51:10] {1793} INFO - iteration 62, current learner lgbm
[flaml.automl: 10-22 16:51:13] {1987} INFO - at 97.5s, estimator lgbm's best error=0.0776, best estimator lgbm's best error=0.0776
[flaml.automl: 10-22 16:51:13] {1793} INFO - iteration 63, current learner lgbm
[flaml.automl: 10-22 16:51:13] {1987} INFO - at 98.0s, estimator lgbm's best error=0.0776, best estimator lgbm's best error=0.0776
[flaml.automl: 10-22 16:51:13] {2087} INFO - selected model: LGBMRegressor(colsample_bytree=0.9387926402887784,
learning_rate=0.07462243384581825, max_bin=511,
min_child_samples=3, n_estimators=545, num_leaves=18,
reg_alpha=0.10333698843991193, reg_lambda=0.004913466630336639,
verbose=-1)
[flaml.automl: 10-22 16:51:14] {2151} INFO - retrain lgbm for 0.5s
[flaml.automl: 10-22 16:51:14] {2155} INFO - retrained model: LGBMRegressor(colsample_bytree=0.9387926402887784,
learning_rate=0.07462243384581825, max_bin=511,
min_child_samples=3, n_estimators=545, num_leaves=18,
reg_alpha=0.10333698843991193, reg_lambda=0.004913466630336639,
verbose=-1)
[flaml.automl: 10-22 16:51:14] {1576} INFO - fit succeeded
[flaml.automl: 10-22 16:51:14] {1578} INFO - Time taken to find the best model: 62.44079804420471
# Inspect the best estimator found, then predict on the held-out test set.
automl.model
''' compute predictions of testing dataset '''
y_pred = automl.predict(X_test)
print('Predicted labels', y_pred)
print('True labels', y_test)
Predicted labels [ 7666.86109386 15330.85711618 8270.28789868 ... 8673.77484936
12713.23316695 17874.71304822]
True labels 1989 8311
985 15991
3763 8998
237 18148
4926 11490
...
2690 6390
946 18000
5423 9689
5177 11975
5949 17000
Name: price, Length: 1809, dtype: int64
''' compute different metric values on testing dataset'''
from flaml.ml import sklearn_metric_loss_score
# sklearn_metric_loss_score returns a *loss*, so R^2 is reported as 1 - loss.
print('r2', '=', 1 - sklearn_metric_loss_score('r2', y_pred, y_test))
print('mse', '=', sklearn_metric_loss_score('mse', y_pred, y_test))
print('mae', '=', sklearn_metric_loss_score('mae', y_pred, y_test))
r2 = 0.8911677265464114
mse = 8815374.800960746
mae = 1460.6715680772309
from flaml.data import get_output_from_log
# Replay the FLAML search log: per-iteration wall-clock times, validation
# losses (current and best-so-far), and the hyper-parameter configs tried.
# NOTE(review): assumes `settings['log_file_name']` was defined in the cell
# that launched automl.fit() — confirm against the earlier cells.
time_history, best_valid_loss_history, valid_loss_history, config_history, train_loss_history = \
get_output_from_log(filename=settings['log_file_name'], time_budget=60)
for config in config_history:
print(config)
{'Current Learner': 'lgbm', 'Current Sample': 4221, 'Current Hyper-parameters': {'n_estimators': 4, 'num_leaves': 4, 'min_child_samples': 20, 'learning_rate': 0.09999999999999995, 'log_max_bin': 8, 'colsample_bytree': 1.0, 'reg_alpha': 0.0009765625, 'reg_lambda': 1.0}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 4, 'num_leaves': 4, 'min_child_samples': 20, 'learning_rate': 0.09999999999999995, 'log_max_bin': 8, 'colsample_bytree': 1.0, 'reg_alpha': 0.0009765625, 'reg_lambda': 1.0}}
{'Current Learner': 'lgbm', 'Current Sample': 4221, 'Current Hyper-parameters': {'n_estimators': 4, 'num_leaves': 4, 'min_child_samples': 12, 'learning_rate': 0.26770501231052046, 'log_max_bin': 7, 'colsample_bytree': 1.0, 'reg_alpha': 0.001348364934537134, 'reg_lambda': 1.4442580148221913}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 4, 'num_leaves': 4, 'min_child_samples': 12, 'learning_rate': 0.26770501231052046, 'log_max_bin': 7, 'colsample_bytree': 1.0, 'reg_alpha': 0.001348364934537134, 'reg_lambda': 1.4442580148221913}}
{'Current Learner': 'lgbm', 'Current Sample': 4221, 'Current Hyper-parameters': {'n_estimators': 9, 'num_leaves': 4, 'min_child_samples': 9, 'learning_rate': 0.7260594590615893, 'log_max_bin': 9, 'colsample_bytree': 0.9285002286474459, 'reg_alpha': 0.0036840681931986645, 'reg_lambda': 0.7532480505730402}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 9, 'num_leaves': 4, 'min_child_samples': 9, 'learning_rate': 0.7260594590615893, 'log_max_bin': 9, 'colsample_bytree': 0.9285002286474459, 'reg_alpha': 0.0036840681931986645, 'reg_lambda': 0.7532480505730402}}
{'Current Learner': 'lgbm', 'Current Sample': 4221, 'Current Hyper-parameters': {'n_estimators': 10, 'num_leaves': 5, 'min_child_samples': 5, 'learning_rate': 0.7590459488450945, 'log_max_bin': 8, 'colsample_bytree': 0.8304072431299575, 'reg_alpha': 0.001951378031519758, 'reg_lambda': 0.04792552866398477}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 10, 'num_leaves': 5, 'min_child_samples': 5, 'learning_rate': 0.7590459488450945, 'log_max_bin': 8, 'colsample_bytree': 0.8304072431299575, 'reg_alpha': 0.001951378031519758, 'reg_lambda': 0.04792552866398477}}
{'Current Learner': 'lgbm', 'Current Sample': 4221, 'Current Hyper-parameters': {'n_estimators': 28, 'num_leaves': 4, 'min_child_samples': 4, 'learning_rate': 0.41929025492645006, 'log_max_bin': 8, 'colsample_bytree': 0.7610534336273627, 'reg_alpha': 0.0009765625, 'reg_lambda': 0.009280655005879927}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 28, 'num_leaves': 4, 'min_child_samples': 4, 'learning_rate': 0.41929025492645006, 'log_max_bin': 8, 'colsample_bytree': 0.7610534336273627, 'reg_alpha': 0.0009765625, 'reg_lambda': 0.009280655005879927}}
{'Current Learner': 'lgbm', 'Current Sample': 4221, 'Current Hyper-parameters': {'n_estimators': 75, 'num_leaves': 14, 'min_child_samples': 3, 'learning_rate': 0.17402065726724145, 'log_max_bin': 8, 'colsample_bytree': 0.6649148062238498, 'reg_alpha': 0.0009765625, 'reg_lambda': 0.006761362450996487}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 75, 'num_leaves': 14, 'min_child_samples': 3, 'learning_rate': 0.17402065726724145, 'log_max_bin': 8, 'colsample_bytree': 0.6649148062238498, 'reg_alpha': 0.0009765625, 'reg_lambda': 0.006761362450996487}}
{'Current Learner': 'lgbm', 'Current Sample': 4221, 'Current Hyper-parameters': {'n_estimators': 49, 'num_leaves': 43, 'min_child_samples': 2, 'learning_rate': 0.1530612501227463, 'log_max_bin': 10, 'colsample_bytree': 0.7463308378914483, 'reg_alpha': 0.0009765625, 'reg_lambda': 0.012698515198279517}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 49, 'num_leaves': 43, 'min_child_samples': 2, 'learning_rate': 0.1530612501227463, 'log_max_bin': 10, 'colsample_bytree': 0.7463308378914483, 'reg_alpha': 0.0009765625, 'reg_lambda': 0.012698515198279517}}
{'Current Learner': 'lgbm', 'Current Sample': 4221, 'Current Hyper-parameters': {'n_estimators': 84, 'num_leaves': 24, 'min_child_samples': 2, 'learning_rate': 0.18908812295671118, 'log_max_bin': 10, 'colsample_bytree': 0.7965188503204358, 'reg_alpha': 0.004577823970660193, 'reg_lambda': 0.02722880765786953}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 84, 'num_leaves': 24, 'min_child_samples': 2, 'learning_rate': 0.18908812295671118, 'log_max_bin': 10, 'colsample_bytree': 0.7965188503204358, 'reg_alpha': 0.004577823970660193, 'reg_lambda': 0.02722880765786953}}
{'Current Learner': 'lgbm', 'Current Sample': 4221, 'Current Hyper-parameters': {'n_estimators': 49, 'num_leaves': 44, 'min_child_samples': 5, 'learning_rate': 0.1530612501227464, 'log_max_bin': 9, 'colsample_bytree': 0.7463308378914483, 'reg_alpha': 0.0009765625000000002, 'reg_lambda': 0.012698515198279517}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 49, 'num_leaves': 44, 'min_child_samples': 5, 'learning_rate': 0.1530612501227464, 'log_max_bin': 9, 'colsample_bytree': 0.7463308378914483, 'reg_alpha': 0.0009765625000000002, 'reg_lambda': 0.012698515198279517}}
{'Current Learner': 'lgbm', 'Current Sample': 4221, 'Current Hyper-parameters': {'n_estimators': 186, 'num_leaves': 54, 'min_child_samples': 4, 'learning_rate': 0.23158746785935408, 'log_max_bin': 8, 'colsample_bytree': 0.7577916169254649, 'reg_alpha': 0.0009765625, 'reg_lambda': 0.03188419400042777}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 186, 'num_leaves': 54, 'min_child_samples': 4, 'learning_rate': 0.23158746785935408, 'log_max_bin': 8, 'colsample_bytree': 0.7577916169254649, 'reg_alpha': 0.0009765625, 'reg_lambda': 0.03188419400042777}}
{'Current Learner': 'lgbm', 'Current Sample': 4221, 'Current Hyper-parameters': {'n_estimators': 342, 'num_leaves': 49, 'min_child_samples': 7, 'learning_rate': 0.1180103887802835, 'log_max_bin': 9, 'colsample_bytree': 0.7407378789573081, 'reg_alpha': 0.0009765625, 'reg_lambda': 0.11929523168059925}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 342, 'num_leaves': 49, 'min_child_samples': 7, 'learning_rate': 0.1180103887802835, 'log_max_bin': 9, 'colsample_bytree': 0.7407378789573081, 'reg_alpha': 0.0009765625, 'reg_lambda': 0.11929523168059925}}
{'Current Learner': 'lgbm', 'Current Sample': 4221, 'Current Hyper-parameters': {'n_estimators': 186, 'num_leaves': 54, 'min_child_samples': 4, 'learning_rate': 0.23158746785935408, 'log_max_bin': 8, 'colsample_bytree': 0.7577916169254649, 'reg_alpha': 0.008948061409181867, 'reg_lambda': 0.03188419400042777}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 186, 'num_leaves': 54, 'min_child_samples': 4, 'learning_rate': 0.23158746785935408, 'log_max_bin': 8, 'colsample_bytree': 0.7577916169254649, 'reg_alpha': 0.008948061409181867, 'reg_lambda': 0.03188419400042777}}
{'Current Learner': 'lgbm', 'Current Sample': 4221, 'Current Hyper-parameters': {'n_estimators': 119, 'num_leaves': 27, 'min_child_samples': 3, 'learning_rate': 0.17571044190405946, 'log_max_bin': 8, 'colsample_bytree': 0.9185336079509137, 'reg_alpha': 0.12438679004009143, 'reg_lambda': 0.01900130015865831}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 119, 'num_leaves': 27, 'min_child_samples': 3, 'learning_rate': 0.17571044190405946, 'log_max_bin': 8, 'colsample_bytree': 0.9185336079509137, 'reg_alpha': 0.12438679004009143, 'reg_lambda': 0.01900130015865831}}
plt.title('Learning Curve')
plt.xlabel('Wall Clock Time (s)')
plt.ylabel('Validation r2')
# Each trial's validation r2 (losses are 1 - r2, so invert them back).
plt.scatter(time_history, 1 - np.array(valid_loss_history))
# Best-so-far r2 as a step function of elapsed search time.
plt.step(time_history, 1 - np.array(best_valid_loss_history), where='post')
plt.show()
# Boosting iteration count of the best model.
automl.best_iteration
# Test-set r2 of the FLAML model, for comparison with default LightGBM below.
print('flaml r2', '=', 1 - sklearn_metric_loss_score('r2', y_pred, y_test))
flaml r2 = 0.8911677265464114
# Baseline comparison: train LightGBM directly (no FLAML tuning).
import lightgbm as lgb
print(f'lightgbm version = {lgb.__version__}')
lightgbm version = 3.3.0
# Baseline LightGBM regressor: library defaults everywhere except a
# heavier L1 penalty (reg_alpha=5.0).
params = dict(
    boosting_type='gbdt',
    class_weight=None,
    colsample_bytree=1.0,
    importance_type='split',
    learning_rate=0.1,
    max_depth=-1,
    min_child_samples=20,
    min_child_weight=0.001,
    min_split_gain=0.0,
    n_estimators=100,
    n_jobs=-1,
    num_leaves=31,
    objective=None,
    random_state=None,
    reg_alpha=5.0,
    reg_lambda=0.0,
    silent=True,
    subsample=1.0,
    subsample_for_bin=200000,
    subsample_freq=0,
)
lgbm = lgb.LGBMRegressor(**params)

# Score on the test split every 10 rounds; stop early after 100 rounds
# without improvement on the eval set.
fit_params = dict(
    early_stopping_rounds=100,
    eval_set=[(X_test, y_test)],
    verbose=10,
)
lgbm.fit(X_train, y_train, **fit_params) # with early_stopping and lgb.plot_metric
[10] valid_0's l2: 2.29424e+07
[20] valid_0's l2: 1.34804e+07
[30] valid_0's l2: 1.12093e+07
[40] valid_0's l2: 1.05607e+07
[50] valid_0's l2: 1.0238e+07
[60] valid_0's l2: 1.013e+07
[70] valid_0's l2: 1.00044e+07
[80] valid_0's l2: 9.9589e+06
[90] valid_0's l2: 9.93005e+06
[100] valid_0's l2: 9.88753e+06
# Test-set predictions from the untuned LightGBM baseline.
y_pred = lgbm.predict(X_test)
from flaml.ml import sklearn_metric_loss_score
# r2 of default LightGBM — compare against the tuned FLAML r2 printed above.
print('default lgbm r2', '=', 1 - sklearn_metric_loss_score('r2', y_pred, y_test))
default lgbm r2 = 0.8779801320818768
# Plot the recorded eval metric: after roughly 20 boosting rounds the l2
# on the eval set starts to plateau.
lgb.plot_metric(lgbm)
# R^2 on the training split (optimistic relative to the test score above).
lgbm.score(X_train, y_train)
lgbm.predict(X_test[:5])
# Cross-validate the same estimator on 5 random 80/20 shuffles of the full
# data. FIX: cross_val_score was used below without ever being imported
# (NameError); import it alongside ShuffleSplit.
from sklearn.model_selection import ShuffleSplit, cross_val_score
cv = ShuffleSplit(n_splits=5, test_size=0.2)
scores = cross_val_score(lgbm, X, y, cv=cv)
print(scores)
print(f'mean scores = {scores.mean()}, sd={scores.std():.4f}')
[0.84913259 0.90094793 0.8444652 0.92274986 0.92563647]
mean scores = 0.8885864095521896, sd=0.0352
# NOTE(review): `pip install graphviz` installs only the Python bindings;
# the Graphviz system binaries must also be available and the session
# restarted, otherwise lgb.create_tree_digraph raises
# "ImportError: You must install graphviz..." (seen in the output below).
!pip install -U graphviz
# Node annotations to render inside each tree diagram.
show_info=['split_gain', 'internal_value', 'data_percentage', 'leaf_count']
orientation='vertical'
# Number of boosted trees in the fitted model; upper bound for the picker.
max_num_trees=lgbm.__dict__['_Booster'].num_trees()
# Widget: which tree of the ensemble to draw (0-based).
w_tree_index=widgets.BoundedIntText(
value=0,
min=0,
max=max_num_trees-1,
step=1,
continuous_update=True,
)
# Widget: maximum tree depth shown in the diagram.
w_max_depth=widgets.BoundedIntText(
value=3,
min=2,
max=10,
step=1,
continuous_update=True,
)
# Widget: number of estimators; only re-bounds w_tree_index in plot_tree.
w_n_estimators=widgets.BoundedIntText(
value=3,
min=1,
max=100,
step=1,
continuous_update=True,
)
@interact
def plot_tree(tree_index=w_tree_index, orientation=['vertical', 'horizontal'], max_depth=w_max_depth, n_estimators=w_n_estimators, save_tree_img=[False, True]):
    """Interactively render one tree of the fitted LightGBM ensemble.

    All parameters are supplied by ipywidgets via @interact:
        tree_index: which boosted tree to draw (0-based).
        orientation: digraph layout, 'vertical' or 'horizontal'.
        max_depth, n_estimators: widget values only — they do NOT retrain
            the model; n_estimators merely re-bounds the tree picker below.
        save_tree_img: if True, also render the diagram to tree<index>.png.

    Returns the graphviz Digraph so the notebook displays it inline.
    """
    # FIX: removed an unused `params` dict that was built here from
    # n_estimators/max_depth but never applied anywhere, misleadingly
    # suggesting the model was being re-configured per interaction.
    case_index = 1
    # Show one sample case and its prediction alongside the tree.
    print(X_test.iloc[[case_index]])
    print(f'predicted value = {lgbm.predict(X_test.iloc[[case_index]])}')
    # Keep the tree picker's upper bound in sync with the widget value.
    w_tree_index.max = w_n_estimators.value - 1
    g = lgb.create_tree_digraph(lgbm, orientation=orientation, tree_index=tree_index, show_info=show_info, precision=4)
    if save_tree_img:
        g.format = 'png'
        g.render(f'tree{tree_index}', view=False, cleanup=True)
    return g
Unsupported output type: clearOutput
year transmission fuelType engineSize mfg mileage_km kml
985 2016.0 Manual Diesel 2.0 bmw 64240.02 25.04
predicted value = [15583.23850216]
Execution error
ImportError: You must install graphviz and restart your session to plot tree.
# SHAP: per-feature additive explanations of the model's predictions.
!pip install shap
import shap
print(f'shap version {shap.__version__}')
# load JS visualization code to notebook
shap.initjs()
Collecting shap
Downloading shap-0.40.0.tar.gz (371 kB)
|████████████████████████████████| 371 kB 32.3 MB/s
Installing build dependencies ... done
Getting requirements to build wheel ... done
Preparing wheel metadata ... done
Requirement already satisfied: numpy in /shared-libs/python3.7/py/lib/python3.7/site-packages (from shap) (1.19.5)
Requirement already satisfied: pandas in /shared-libs/python3.7/py/lib/python3.7/site-packages (from shap) (1.2.5)
Collecting cloudpickle
Downloading cloudpickle-2.0.0-py3-none-any.whl (25 kB)
Collecting numba
Downloading numba-0.54.1-cp37-cp37m-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (3.3 MB)
|████████████████████████████████| 3.3 MB 36.9 MB/s
Requirement already satisfied: scipy in /shared-libs/python3.7/py/lib/python3.7/site-packages (from shap) (1.7.1)
Requirement already satisfied: scikit-learn in /shared-libs/python3.7/py/lib/python3.7/site-packages (from shap) (1.0)
Requirement already satisfied: packaging>20.9 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from shap) (21.0)
Collecting slicer==0.0.7
Downloading slicer-0.0.7-py3-none-any.whl (14 kB)
Requirement already satisfied: tqdm>4.25.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from shap) (4.62.3)
Requirement already satisfied: pyparsing>=2.0.2 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from packaging>20.9->shap) (2.4.7)
Requirement already satisfied: setuptools in /root/venv/lib/python3.7/site-packages (from numba->shap) (58.1.0)
Collecting llvmlite<0.38,>=0.37.0rc1
Downloading llvmlite-0.37.0-cp37-cp37m-manylinux2014_x86_64.whl (26.3 MB)
|████████████████████████████████| 26.3 MB 39.2 MB/s
Requirement already satisfied: pytz>=2017.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pandas->shap) (2021.3)
Requirement already satisfied: python-dateutil>=2.7.3 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pandas->shap) (2.8.2)
Requirement already satisfied: six>=1.5 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from python-dateutil>=2.7.3->pandas->shap) (1.16.0)
Requirement already satisfied: joblib>=0.11 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from scikit-learn->shap) (1.1.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from scikit-learn->shap) (3.0.0)
Building wheels for collected packages: shap
Building wheel for shap (PEP 517) ... done
Created wheel for shap: filename=shap-0.40.0-cp37-cp37m-linux_x86_64.whl size=433707 sha256=f83c3312b4eff4aa8ac37a62b2f563695c210b6ff34047da2439d38639cb7dad
Stored in directory: /root/.cache/pip/wheels/ec/35/84/e304841ac4b910bc95fe9a6e5302eb2507b4333728851dcbfb
Successfully built shap
Installing collected packages: llvmlite, slicer, numba, cloudpickle, shap
Successfully installed cloudpickle-2.0.0 llvmlite-0.37.0 numba-0.54.1 shap-0.40.0 slicer-0.0.7
WARNING: You are using pip version 21.2.4; however, version 21.3.1 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
shap version 0.40.0
def case_detail(case_data):
    """Turn the payload of a shap.force_plot() object into a DataFrame.

    Parameters:
        case_data: object returned by shap.force_plot(); its
            ``.data['features']`` maps column positions to per-feature info
            and ``.data['featureNames']`` maps positions to readable names.

    Returns:
        A DataFrame of the per-feature info whose columns are renamed from
        positional keys to the human-readable feature names.
    """
    features = case_data.data['features']
    de = pd.DataFrame(features)
    # Rename positional columns to feature names; comprehension replaces the
    # original manual append loop (same order: the DataFrame's columns follow
    # the dict's key insertion order).
    de.columns = [case_data.data['featureNames'][i] for i in features.keys()]
    return de
def individual_case_plot(explainer, X, case_index, verbose=False):
    """Render a SHAP force plot for one row of X.

    Parameters:
        explainer: a fitted shap.TreeExplainer.
        X: feature DataFrame to take the case from.
        case_index: positional index of the row to explain.
        verbose: if True, pretty-print the plot object's internals.

    Returns the force-plot object so the notebook displays it inline.

    >>> individual_case_plot(explainer, X_train, 1)
    """
    # FIX: pprint was called below but never imported anywhere in the
    # notebook, so verbose=True raised NameError.
    from pprint import pprint

    shap_values = explainer.shap_values(X.iloc[[case_index]])
    g = shap.force_plot(explainer.expected_value, shap_values=shap_values, features=X.iloc[case_index, :])
    if verbose:
        pprint(g.__dict__)
    return g
# explain the model's predictions using SHAP
explainer = shap.TreeExplainer(lgbm)
# One row per sample, one column per feature: each entry is that feature's
# additive contribution to the model output for that sample.
shap_values = explainer.shap_values(X)
X
shap_values[:3]
# NOTE(review): assumes `feature_cols` (defined in an earlier cell) matches
# the column order of X — confirm against where X was built.
dshap=pd.DataFrame(shap_values, columns=feature_cols)
dshap
# mean(abs(SHAP value)): average impact on model output magnitude
feature_imp = np.abs(dshap).mean().sort_values(ascending=False)
pd.DataFrame(feature_imp).style.background_gradient(cmap='Blues')
# Bar chart of mean |SHAP| per feature (global importance).
shap.summary_plot(dshap, X, plot_type="bar")
# summarize the effects of all the features
shap.summary_plot(shap_values, X)