!pip install lightgbm
Collecting lightgbm
Downloading lightgbm-3.2.1-py3-none-win_amd64.whl (1.0 MB)
Requirement already satisfied: scipy in c:\users\jgaur\anaconda3\lib\site-packages (from lightgbm) (1.5.2)
Requirement already satisfied: scikit-learn!=0.22.0 in c:\users\jgaur\anaconda3\lib\site-packages (from lightgbm) (0.24.1)
Requirement already satisfied: numpy in c:\users\jgaur\anaconda3\lib\site-packages (from lightgbm) (1.19.2)
Requirement already satisfied: wheel in c:\users\jgaur\anaconda3\lib\site-packages (from lightgbm) (0.35.1)
Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\jgaur\anaconda3\lib\site-packages (from scikit-learn!=0.22.0->lightgbm) (2.1.0)
Requirement already satisfied: joblib>=0.11 in c:\users\jgaur\anaconda3\lib\site-packages (from scikit-learn!=0.22.0->lightgbm) (0.17.0)
Installing collected packages: lightgbm
Successfully installed lightgbm-3.2.1
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from contextlib import contextmanager
from time import time
import lightgbm as lgbm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
sns.set_style('darkgrid')
warnings.filterwarnings('ignore')
# ---- load the placement dataset and take a first look ----
df = pd.read_csv('Placement_Data_Full_Class.csv')

# first five rows
df.head()

# (rows, columns)
df.shape

# per-column count of missing values (only `salary` has NaNs)
df.isnull().sum()

# dtypes and non-null summary
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215 entries, 0 to 214
Data columns (total 15 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 sl_no 215 non-null int64
1 gender 215 non-null object
2 ssc_p 215 non-null float64
3 ssc_b 215 non-null object
4 hsc_p 215 non-null float64
5 hsc_b 215 non-null object
6 hsc_s 215 non-null object
7 degree_p 215 non-null float64
8 degree_t 215 non-null object
9 workex 215 non-null object
10 etest_p 215 non-null float64
11 specialisation 215 non-null object
12 mba_p 215 non-null float64
13 status 215 non-null object
14 salary 148 non-null float64
dtypes: float64(6), int64(1), object(8)
memory usage: 25.3+ KB
# preview the raw frame once more
df.head()

# gender distribution
gender_label = df.gender.value_counts()

# bar chart of gender counts.
# FIX: pass x/y as keywords — positional x/y for seaborn plotting functions
# were deprecated in seaborn 0.12 and removed afterwards, so the original
# positional call fails on current seaborn.
plt.figure(figsize=(10, 5))
sns.barplot(x=gender_label.index, y=gender_label.values);
plt.xlabel('gender', fontsize=20)
plt.ylabel('count', fontsize=20)
plt.title('gender count', fontsize=20);

# count of degree types
df.degree_t.value_counts()

# count of specialisation
df.specialisation.value_counts()

# count of placement status
df.status.value_counts()

# placement status split by gender
plt.figure(figsize=(8, 5))
sns.countplot(x='status', hue='gender', data=df);
# FIX: work on an explicit copy — the original `df_ = df` only aliased the
# frame, so the fillna/label-encoding below silently mutated the `df` used
# by the exploratory plots above.
df_ = df.copy()

# numeric columns: fill missing values with 0 (only `salary` has NaNs here)
for column in df_.columns:
    if df_[column].dtype != 'object':
        df_[column] = df_[column].fillna(0)

# peek at the first two rows, transposed for readability
df_[0:2].T

# categorical columns: fill missing with the placeholder 'N', then
# label-encode each one to integer codes
for column in df_.columns:
    if df_[column].dtype == 'object':
        df_[column] = df_[column].fillna('N')
        lbl_enc = LabelEncoder()
        lbl_enc.fit(list(df_[column].values))
        df_[column] = lbl_enc.transform(df_[column].values)

# fully numeric frame used from here on
df1 = df_

# sanity check: all columns are now numeric
df1[0:2].T
# targets: placement status and salary
y_df1 = df1[['status', 'salary']]

# features: every remaining column
X_df1 = df1.drop(['status', 'salary'], axis=1)

# quick look at both, transposed
print(y_df1[0:2].T)
print("-" * 100)
print(X_df1[0:2].T)
0 1
status 1.0 1.0
salary 270000.0 200000.0
----------------------------------------------------------------------------------------------------
0 1
sl_no 1.0 2.00
gender 1.0 1.00
ssc_p 67.0 79.33
ssc_b 1.0 0.00
hsc_p 91.0 78.33
hsc_b 1.0 1.00
hsc_s 1.0 2.00
degree_p 58.0 77.48
degree_t 2.0 2.00
workex 0.0 1.00
etest_p 55.0 86.50
specialisation 1.0 0.00
mba_p 58.8 66.28
# materialise features and targets as numpy arrays
X_train = np.array(X_df1)
y_train = np.array(y_df1)

# sanity-check shapes: expected (215, 13) and (215, 2)
print(X_train.shape)
print(y_train.shape)
(215, 13)
(215, 2)
# keep the feature names (they were dropped in the numpy round-trip)
columns = list(X_df1.columns)
columns

# back to a labelled DataFrame for the feature pipeline
X_train = pd.DataFrame(X_train, columns=columns)

# preview the rebuilt frame
X_train.head()
''' creating a function create_nf '''
def create_nf(inp_df, use_cols=None):
    """Return a copy of *inp_df* restricted to the selected feature columns.

    Parameters
    ----------
    inp_df : pandas.DataFrame
        Source frame.
    use_cols : list of str, optional
        Columns to keep. Defaults to the module-level ``columns`` list
        (the training feature names), preserving the original behavior.

    Returns
    -------
    pandas.DataFrame
        An independent copy — mutating it does not touch *inp_df*.
    """
    if use_cols is None:
        use_cols = columns  # module-level feature-name list
    return inp_df[use_cols].copy()
''' creating class Timer '''
class Timer:
    """Context manager that measures and reports wall-clock time.

    Parameters
    ----------
    logger : logging.Logger, optional
        If given, the elapsed-time message goes to ``logger.info``;
        otherwise it is printed to stdout.
    frmt_str : str
        Format string receiving the elapsed seconds as one float argument.
    prefix, suffix : str, optional
        Text glued before/after the format string with *sep*.
    sep : str
        Separator used when attaching *prefix*/*suffix*.
    """

    def __init__(self, logger=None, frmt_str='{:.3f}[s]', prefix=None, suffix=None, sep=' '):
        if prefix:
            frmt_str = str(prefix) + sep + frmt_str
        if suffix:
            frmt_str = frmt_str + sep + str(suffix)
        self.frmt_str = frmt_str
        self.logger = logger
        self.start = None
        self.end = None

    @property
    def duration(self):
        """Elapsed seconds; 0 until the managed block has finished."""
        if self.end is None:
            return 0
        return self.end - self.start

    def __enter__(self):
        self.start = time()
        # FIX: return self so `with Timer(...) as t:` binds the timer
        # (the original returned None, making the as-clause useless)
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.end = time()
        out_str = self.frmt_str.format(self.duration)
        if self.logger:
            self.logger.info(out_str)
        else:
            print(out_str)
def to_feature(inp_df):
    """Run every feature-builder in the pipeline and concatenate the results.

    Parameters
    ----------
    inp_df : pandas.DataFrame
        Raw input frame handed to each builder.

    Returns
    -------
    pandas.DataFrame
        All builders' outputs concatenated column-wise.
    """
    pipeline = [create_nf, ]
    out_df = pd.DataFrame()
    for func in tqdm(pipeline, total=len(pipeline)):
        with Timer(prefix='create' + func.__name__ + ' '):
            part = func(inp_df)
        # every builder must preserve the row count
        assert len(part) == len(inp_df), func.__name__
        # BUG FIX: accumulate into out_df. The original concatenated into a
        # fresh variable each iteration without updating the accumulator, so
        # with more than one builder only the last one's columns survived.
        out_df = pd.concat([out_df, part], axis=1)
    return out_df
X_train_result = to_feature(X_train)
100%|███████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 166.65it/s]
createcreate_nf 0.002[s]
''' creating fit function '''
def fit(X, y, cv, params: dict = None, verbose: int = 50):
    """Train one LightGBM regressor per CV fold; report out-of-fold RMSE.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix, shape (n_samples, n_features).
    y : numpy.ndarray
        Target vector, shape (n_samples,).
    cv : list of (train_idx, valid_idx)
        Pre-computed fold index pairs.
    params : dict, optional
        Keyword arguments forwarded to ``lgbm.LGBMRegressor``.
    verbose : int
        LightGBM evaluation logging period.

    Returns
    -------
    (numpy.ndarray, list)
        Out-of-fold predictions aligned with *y*, and the fitted models.
    """
    if params is None:
        params = {}
    models = []
    # FIX: np.float was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin float is the documented replacement
    ypred = np.zeros_like(y, dtype=float)
    for i, (t_idx, v_idx) in enumerate(cv):
        X_train, y_train = X[t_idx], y[t_idx]
        X_test, y_test = X[v_idx], y[v_idx]
        lg = lgbm.LGBMRegressor(**params)
        with Timer(prefix='fit fold={} '.format(i)):
            lg.fit(X_train, y_train, eval_set=[(X_test, y_test)],
                   early_stopping_rounds=100, verbose=verbose)
        pred = lg.predict(X_test)
        ypred[v_idx] = pred
        models.append(lg)
        # FIX: the metric computed here is plain RMSE (no log transform);
        # the original message mislabeled it as "RMSLE"
        print(f'Fold {i} RMSE: {mean_squared_error(y_test, pred) ** .5:.4f}')
        print()
    result_score = mean_squared_error(y, ypred) ** .5
    print('-' * 50)
    print('RMSE: {:.4f}'.format(result_score))
    return ypred, models
# LightGBM hyper-parameters shared by both targets (status, salary)
p = dict(
    objective='rmse',
    learning_rate=0.1,
    reg_lambda=1.0,
    reg_alpha=0.1,
    max_depth=5,
    n_estimators=10000,      # large cap; early stopping picks the real count
    colsample_bytree=0.5,
    min_child_samples=10,
    subsample_freq=3,
    subsample=0.9,
    importance_type='gain',
    random_state=71,
    num_leaves=62,
)
# wrap y_train in a DataFrame so each target column can be selected by position
y_train = pd.DataFrame(y_train)
y_train.head()

# column 0 = status, column 1 = salary
target = ['status', 'salary']
for idx in range(2):
    # 5-fold CV with a fixed seed for reproducibility
    fold = KFold(n_splits=5, shuffle=True, random_state=71)
    y = np.array(y_train.iloc[:, idx])
    cv = list(fold.split(X_train_result, y))
    # train per-fold models and collect out-of-fold predictions
    ypred, models = fit(X_train_result.values, y, cv, params=p, verbose=500)
    # scatter of out-of-fold predictions against ground truth
    fig, ax = plt.subplots(figsize=(6, 6))
    ax.set_title(target[idx], fontsize=20)
    ax.set_xlabel('oof ' + str(idx), fontsize=12)
    ax.set_ylabel('train_y ' + str(idx), fontsize=12)
    ax.scatter(ypred, y)
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[27] valid_0's rmse: 0.328506
fit fold=0 0.130[s]
Fold 0 RMSLE: 0.3285
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[225] valid_0's rmse: 0.30133
fit fold=1 0.284[s]
Fold 1 RMSLE: 0.3013
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[41] valid_0's rmse: 0.28941
fit fold=2 0.148[s]
Fold 2 RMSLE: 0.2894
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[24] valid_0's rmse: 0.357566
fit fold=3 0.101[s]
Fold 3 RMSLE: 0.3576
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[57] valid_0's rmse: 0.301637
fit fold=4 0.131[s]
Fold 4 RMSLE: 0.3016
--------------------------------------------------
RMSE: 0.3166
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[15] valid_0's rmse: 100571
fit fold=0 0.092[s]
Fold 0 RMSLE: 100571.0590
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[40] valid_0's rmse: 114054
fit fold=1 0.130[s]
Fold 1 RMSLE: 114054.0526
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[42] valid_0's rmse: 114996
fit fold=2 0.134[s]
Fold 2 RMSLE: 114995.8491
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[19] valid_0's rmse: 166836
fit fold=3 0.184[s]
Fold 3 RMSLE: 166836.1341
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[42] valid_0's rmse: 113195
fit fold=4 0.167[s]
Fold 4 RMSLE: 113195.0970
--------------------------------------------------
RMSE: 124092.1720