!pip install lightgbm
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from contextlib import contextmanager
from time import time
import lightgbm as lgbm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
sns.set_style('darkgrid')
warnings.filterwarnings('ignore')
''' reading dataset '''
df = pd.read_csv('Placement_Data_Full_Class.csv')
''' displaying first 5 rows '''
df.head()
''' shape of data '''
df.shape
''' checking null values (salary is NaN for non-placed students) '''
df.isnull().sum()
''' checking info of data '''
df.info()
df.head()
''' count of gender '''
gender_label = df.gender.value_counts()
''' plotting count -- fix: pass x/y as keywords; positional barplot args were removed in seaborn >= 0.12 '''
plt.figure(figsize=(10, 5))
sns.barplot(x=gender_label.index, y=gender_label.values);
plt.xlabel('gender', fontsize=20)
plt.ylabel('count', fontsize=20)
plt.title('gender count', fontsize=20);
''' count of degrees '''
df.degree_t.value_counts()
''' count of specialisation '''
df.specialisation.value_counts()
''' count of status '''
df.status.value_counts()
''' checking placement status split by gender '''
plt.figure(figsize=(8, 5))
sns.countplot(x='status', hue='gender', data=df);
# Work on a copy so the cleaning/encoding below does not mutate the original df
# (the original code aliased df_ = df, silently modifying df in place).
df_ = df.copy()
''' filling numeric null values with 0 '''
for column in df_.columns:
    if df_[column].dtype != 'object':
        df_[column] = df_[column].fillna(0)
''' transpose first two rows for a quick look '''
df_[0:2].T
''' fill categorical nulls with the sentinel 'N', then label-encode each object column '''
for column in df_.columns:
    if df_[column].dtype == 'object':
        df_[column] = df_[column].fillna('N')
        lbl_enc = LabelEncoder()
        # fit + transform on the same values collapses to fit_transform
        df_[column] = lbl_enc.fit_transform(df_[column].values)
df1 = df_
''' transpose df1 '''
df1[0:2].T
''' split the targets (status, salary) from the remaining feature columns '''
y_df1 = df1[['status', 'salary']]
X_df1 = df1.drop(['status', 'salary'], axis=1)
''' quick look at both frames (first two rows, transposed) '''
print(y_df1[0:2].T)
print('-' * 100)
print(X_df1[0:2].T)
''' numpy copies of the training data '''
X_train = np.array(X_df1)
y_train = np.array(y_df1)
''' sanity-check the shapes '''
print(X_train.shape)
print(y_train.shape)
''' remember the feature names, then rebuild X_train as a labelled DataFrame '''
columns = list(X_df1.columns)
columns
X_train = pd.DataFrame(X_train, columns=columns)
X_train.head()
''' creating a function create_nf '''
def create_nf(inp_df, use_cols=None):
    """Return a copy of ``inp_df`` restricted to a column subset.

    Parameters
    ----------
    inp_df : pd.DataFrame
        Source frame to select from.
    use_cols : list of str, optional
        Columns to keep. Defaults to the module-level ``columns`` list,
        preserving the original behavior.

    Returns
    -------
    pd.DataFrame
        An independent copy (mutating it does not touch ``inp_df``).
    """
    if use_cols is None:
        use_cols = columns
    return inp_df[use_cols].copy()
''' creating class Timer '''
class Timer:
    """Context manager that measures wall-clock time of a block and
    logs (or prints) it on exit using a configurable format string."""

    def __init__(self, logger=None, frmt_str='{:.3f}[s]', prefix=None, suffix=None, sep=' '):
        # Build the final message template once, e.g. "prefix {:.3f}[s] suffix".
        if prefix:
            frmt_str = str(prefix) + sep + frmt_str
        if suffix:
            frmt_str = frmt_str + sep + str(suffix)
        self.frmt_str = frmt_str
        self.logger = logger   # if set, duration is sent to logger.info instead of print
        self.start = None
        self.end = None

    @property
    def duration(self):
        """Elapsed seconds; 0 until the managed block has finished."""
        if self.end is None:
            return 0
        return self.end - self.start

    def __enter__(self):
        self.start = time()
        # fix: return self so `with Timer() as t:` binds the instance
        # (the original returned None, making the `as` form useless)
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.end = time()
        out_str = self.frmt_str.format(self.duration)
        if self.logger:
            self.logger.info(out_str)
        else:
            print(out_str)
def to_feature(inp_df):
    """Run every feature-builder over ``inp_df`` and concat the results column-wise.

    Each builder must preserve the row count (asserted per builder).
    Returns a DataFrame holding the columns produced by ALL builders.
    """
    p = [create_nf, ]
    '''creating dataframe'''
    output_df = pd.DataFrame()
    for func in tqdm(p, total=len(p)):
        with Timer(prefix='create' + func.__name__ + ' '):
            df1 = func(inp_df)
        assert len(df1) == len(inp_df), func.__name__
        # fix: accumulate into output_df -- the original concatenated into a
        # throwaway, so only the LAST builder's columns survived, and an empty
        # pipeline raised NameError on return
        output_df = pd.concat([output_df, df1], axis=1)
    return output_df
X_train_result = to_feature(X_train)
''' creating fit function '''
def fit(X, y, cv, params: dict = None, verbose: int = 50):
    """Train one LGBMRegressor per CV fold; return OOF predictions and the models.

    Parameters
    ----------
    X : np.ndarray
        Feature matrix.
    y : np.ndarray
        Target vector.
    cv : list of (train_idx, valid_idx) index pairs.
    params : dict, optional
        Keyword arguments forwarded to LGBMRegressor.
    verbose : int
        Evaluation-log period (rounds between log lines).

    Returns
    -------
    (np.ndarray, list)
        Out-of-fold predictions aligned with ``y``, and the fitted models.
    """
    if params is None:
        params = {}
    m = []
    # fix: np.float was removed in NumPy 1.24 -- use the builtin float
    ypred = np.zeros_like(y, dtype=float)
    for i, (t_idx, v_idx) in enumerate(cv):
        X_train, y_train = X[t_idx], y[t_idx]
        X_test, y_test = X[v_idx], y[v_idx]
        ''' model '''
        lg = lgbm.LGBMRegressor(**params)
        with Timer(prefix='fit fold={} '.format(i)):
            ''' training '''
            # fix: early_stopping_rounds/verbose kwargs were removed in
            # LightGBM 4 (installed at the top of the notebook); the callback
            # API is the supported equivalent
            lg.fit(
                X_train, y_train,
                eval_set=[(X_test, y_test)],
                callbacks=[lgbm.early_stopping(100), lgbm.log_evaluation(verbose)],
            )
        ''' prediction on the held-out fold '''
        pred = lg.predict(X_test)
        ypred[v_idx] = pred
        m.append(lg)
        # fix: the metric computed here is RMSE (sqrt of MSE), not RMSLE
        print(f'Fold {i} RMSE: {mean_squared_error(y_test, pred) ** .5:.4f}')
        print()
    result_score = mean_squared_error(y, ypred) ** .5
    print('-' * 50)
    print('RMSE: {:.4f}'.format(result_score))
    return ypred, m
''' LightGBM hyper-parameters shared by every fold '''
p = {
    'objective': 'rmse',
    'learning_rate': .1,
    'reg_lambda': 1.,
    'reg_alpha': .1,
    'max_depth': 5,
    'n_estimators': 10000,       # large cap; early stopping picks the real count
    'colsample_bytree': .5,
    'min_child_samples': 10,
    'subsample_freq': 3,
    'subsample': .9,
    'importance_type': 'gain',
    'random_state': 71,
    'num_leaves': 62
}
''' rebuild y_train as a DataFrame; name the columns (status, salary) instead of
    the default 0/1 -- downstream access stays positional via iloc '''
y_train = pd.DataFrame(y_train, columns=['status', 'salary'])
y_train.head()
# Per-target 5-fold CV: train, collect out-of-fold predictions, and plot them.
target = ['status', 'salary']
for i, tgt in enumerate(target):
    # fresh splitter per target, same seed for reproducibility
    kf = KFold(n_splits=5, shuffle=True, random_state=71)
    y = np.array(y_train.iloc[:, i])
    splits = list(kf.split(X_train_result, y))
    ypred, models = fit(X_train_result.values, y, splits, params=p, verbose=500)
    # scatter of OOF predictions against the true values
    fig, ax = plt.subplots(figsize=(6, 6))
    ax.set_title(tgt, fontsize=20)
    ax.set_xlabel('oof ' + str(i), fontsize=12)
    ax.set_ylabel('train_y ' + str(i), fontsize=12)
    ax.scatter(ypred, y)