Importing Libraries
!pip install imbalanced-learn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
np.random.seed(42)
Importing Datasets
train = pd.read_csv('/work/umojahack-africa-2022-beginner-challenge/train.csv', parse_dates=['Datetime'])
test = pd.read_csv('/work/umojahack-africa-2022-beginner-challenge/test.csv', parse_dates=['Datetime'])
SampleSubmission = pd.read_csv('/work/umojahack-africa-2022-beginner-challenge/SampleSubmission.csv')
train.head()
test.head()
train.shape, test.shape, SampleSubmission.shape
train.info()
Data Cleaning
train = train.drop(["ID"], axis = 1)
# day
train['Datetime_day'] = train.Datetime.dt.day
# month
train['Datetime_month'] = train.Datetime.dt.month
# year
train['Datetime_year'] = train.Datetime.dt.year
# hour
train['Datetime_hour'] = train.Datetime.dt.hour
# Preview engineered date features
train[['Datetime', 'Datetime_day', 'Datetime_month', 'Datetime_year', 'Datetime_hour']].head()
train.head()
train.isnull().sum()
ax = train.isna().sum().sort_values().plot(kind='barh', figsize=(9, 10))
plt.title('Percentage of Missing Values Per Column in Train Set', fontdict={'size': 15})
# Annotate each bar with the share of missing rows
for p in ax.patches:
    percentage = '{:,.0f}%'.format((p.get_width() / train.shape[0]) * 100)
    width, height = p.get_width(), p.get_height()
    x = p.get_x() + width + 0.02
    y = p.get_y() + height / 2
    ax.annotate(percentage, (x, y))
train_c1 = train.copy()
train.shape, train_c1.shape
# KNN-impute the two PM2.5 sensor columns
imputer = KNNImputer(n_neighbors=5)
train_c1.iloc[:, 1:3] = imputer.fit_transform(train_c1.iloc[:, 1:3])
# Fill Temperature/Humidity with the median of their (month, hour) group, then round up;
# transform("median") keeps the result aligned with the original index
train_c1["Temperature"] = train_c1["Temperature"].fillna(
    train_c1.groupby(["Datetime_month", "Datetime_hour"])["Temperature"].transform("median"))
train_c1["Temperature"] = np.ceil(train_c1["Temperature"])
train_c1["Relative_Humidity"] = train_c1["Relative_Humidity"].fillna(
    train_c1.groupby(["Datetime_month", "Datetime_hour"])["Relative_Humidity"].transform("median"))
train_c1["Relative_Humidity"] = np.ceil(train_c1["Relative_Humidity"])
train_c1.isnull().sum()
train.isnull().sum()
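The same imputation steps are repeated verbatim for the test set later on; a small helper would keep the two in sync. A minimal sketch, assuming this dataset's column names (fill_weather_medians is a hypothetical name):
def fill_weather_medians(df):
    # Fill Temperature/Relative_Humidity with their (month, hour) group median, rounded up
    for col in ["Temperature", "Relative_Humidity"]:
        medians = df.groupby(["Datetime_month", "Datetime_hour"])[col].transform("median")
        df[col] = np.ceil(df[col].fillna(medians))
    return df
Applied as train_c1 = fill_weather_medians(train_c1), and later to Test_X.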
EDA
General Statistics
train.describe(include='all')
train.Offset_fault.value_counts()
fig = px.histogram(data_frame=train, x='Offset_fault')
fig.show()
Exploring Outliers
sns.set_style('darkgrid')
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))
fig.suptitle('Outliers', y=0.93, fontsize=15)
for ax, name in zip(axes.flatten(), ['Sensor1_PM2.5', 'Sensor2_PM2.5', 'Temperature', 'Relative_Humidity']):
    sns.violinplot(x=train_c1[name], ax=ax)
sns.set_style('darkgrid')
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))
fig.suptitle('Outliers', y=0.93, fontsize=15)
for ax, name in zip(axes.flatten(), ['Sensor1_PM2.5', 'Sensor2_PM2.5', 'Temperature', 'Relative_Humidity']):
    sns.boxplot(x=train_c1[name], ax=ax)
sns.set_style('darkgrid')
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(20, 10))
fig.suptitle('Outliers', y=0.93, fontsize=20)
for ax, name in zip(axes.flatten(), ['Sensor1_PM2.5', 'Sensor2_PM2.5', 'Temperature', 'Relative_Humidity']):
    sns.violinplot(x='Offset_fault', y=name, data=train_c1, ax=ax)
Data Distributions & Correlations
plt.figure(figsize=(16,8))
sns.scatterplot(x=train_c1['Sensor1_PM2.5'],y=train_c1['Sensor2_PM2.5'], hue=train_c1.Offset_fault)
num_cols = ['Sensor1_PM2.5', 'Sensor2_PM2.5', 'Temperature', 'Relative_Humidity']
# pairplot creates its own figure, so no preceding plt.figure() call is needed
sns.pairplot(train_c1[num_cols], kind="scatter", plot_kws=dict(s=80, edgecolor="white", linewidth=2.5))
plt.show()
corr = train_c1.corr(numeric_only=True)  # restrict to numeric columns; the Datetime column would otherwise break .corr() in newer pandas
plt.figure(figsize = (13, 8))
sns.heatmap(corr, cmap='RdYlGn', annot = True, center = 0)
plt.title('Correlogram', fontsize = 15, color = 'darkgreen')
plt.show()
Preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.pipeline import Pipeline
scaler = StandardScaler()
power = PowerTransformer(method='yeo-johnson')
# Standardize, then Yeo-Johnson-transform, the four numeric feature columns
train_c1.iloc[:, 1:5] = scaler.fit_transform(train_c1.iloc[:, 1:5])
train_c1.iloc[:, 1:5] = power.fit_transform(train_c1.iloc[:, 1:5])
Test_X = test.drop('ID', axis='columns')
ID = test['ID']  # keep the IDs for the submission files
# day
Test_X['Datetime_day'] = Test_X.Datetime.dt.day
# month
Test_X['Datetime_month'] = Test_X.Datetime.dt.month
# year
Test_X['Datetime_year'] = Test_X.Datetime.dt.year
# hour
Test_X['Datetime_hour'] = Test_X.Datetime.dt.hour
Test_X = Test_X.iloc[:, 1:]  # drop the raw Datetime column, keeping the engineered features
Test_X.head()
Test_X.isnull().sum()
# KNN-impute the two sensor columns
imputer = KNNImputer(n_neighbors=5)
Test_X.iloc[:, :2] = imputer.fit_transform(Test_X.iloc[:, :2])
# Fill Temperature/Humidity with their (month, hour) group median, then round up
Test_X["Temperature"] = Test_X["Temperature"].fillna(
    Test_X.groupby(["Datetime_month", "Datetime_hour"])["Temperature"].transform("median"))
Test_X["Temperature"] = np.ceil(Test_X["Temperature"])
Test_X["Relative_Humidity"] = Test_X["Relative_Humidity"].fillna(
    Test_X.groupby(["Datetime_month", "Datetime_hour"])["Relative_Humidity"].transform("median"))
Test_X["Relative_Humidity"] = np.ceil(Test_X["Relative_Humidity"])
# A second KNN pass covers any values still missing after the group fill
imputer = KNNImputer(n_neighbors=5)
Test_X.iloc[:, 2:4] = imputer.fit_transform(Test_X.iloc[:, 2:4])
# Keep only the two sensor columns, matching the features used for training
Test_X = Test_X.iloc[:, :2]
Test_X.head()
# Note: fitting fresh transformers on the test set is a train/test inconsistency;
# ideally the transformers fitted on the training features would be reused here.
scaler1 = StandardScaler()
power1 = PowerTransformer(method='yeo-johnson')
Test_X = scaler1.fit_transform(Test_X)
Test_X = power1.fit_transform(Test_X)
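The Pipeline imported above can make that reuse explicit: fit both transforms on the training sensor columns once, then apply the same fitted transform to the test set. A minimal sketch, meant in place of the two separate scaling cells (train_scaled/test_scaled are illustrative names):
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PowerTransformer

sensor_cols = ['Sensor1_PM2.5', 'Sensor2_PM2.5']
transform = Pipeline([
    ('scale', StandardScaler()),
    ('power', PowerTransformer(method='yeo-johnson')),
])
train_scaled = transform.fit_transform(train_c1[sensor_cols])  # fit on train only
test_scaled = transform.transform(Test_X)                      # reuse the fitted transform on test
Both steps are columnwise, so transforming only the two sensor columns yields the same values for them as transforming all four.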
# Balance the classes by undersampling: keep all fault rows and an equal number of non-fault rows
train_c1 = train_c1.sample(frac=1, random_state=42)
Offset_fault1 = train_c1.loc[train_c1["Offset_fault"] == 1]
nbr_Offset_fault1 = len(Offset_fault1)
Offset_fault0 = train_c1.loc[train_c1["Offset_fault"] == 0][:nbr_Offset_fault1]
normal_distributed_train = pd.concat([Offset_fault1, Offset_fault0])
new_train = normal_distributed_train.sample(frac=1, random_state=42)
X = new_train.drop('Offset_fault', axis='columns')
X = X.iloc[:, 1:3]  # the two PM2.5 sensor columns
y = new_train['Offset_fault']
X.head()
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42, test_size=0.2, stratify=y)
len(X_train)
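imbalanced-learn, installed at the top of the notebook but otherwise unused, does the same 1:1 balancing in one call. A minimal sketch (X_bal/y_bal are illustrative names):
from imblearn.under_sampling import RandomUnderSampler

# Undersample the majority class down to the minority-class count
rus = RandomUnderSampler(random_state=42)
X_bal, y_bal = rus.fit_resample(train_c1.iloc[:, 1:3], train_c1['Offset_fault'])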
Modeling & Testing
from sklearn.svm import SVC
classifier = SVC(kernel = 'rbf', random_state = 0)
classifier.fit(X_train, y_train)
classifier.score(X_train,y_train)
classifier.score(X_test,y_test)
import pickle
classifier_name = 'classifier.sav'
pickle.dump(classifier, open(classifier_name, 'wb'))
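To confirm the save round-trips cleanly, the pickle can be loaded back and re-scored. A minimal sketch (loaded_clf is an illustrative name):
with open(classifier_name, 'rb') as f:
    loaded_clf = pickle.load(f)
loaded_clf.score(X_test, y_test)  # should match classifier.score(X_test, y_test)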
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(C=1e5)
clf.fit(X_train, y_train)  # fit on the training split only, so the test score below is meaningful
clf.score(X_train,y_train)
clf.score(X_test,y_test)
clf_name = 'clf.sav'
pickle.dump(clf, open(clf_name, 'wb'))
from sklearn.tree import DecisionTreeClassifier
tree_clf = DecisionTreeClassifier(max_depth = 10)
tree_clf.fit(X_train,y_train)
tree_clf.score(X_train,y_train)
tree_clf.score(X_test,y_test)
tree_clf_name = 'tree_clf.sav'
pickle.dump(tree_clf, open(tree_clf_name, 'wb'))
from sklearn.ensemble import RandomForestClassifier
for_clf = RandomForestClassifier()
for_clf.fit(X_train,y_train)
for_clf.score(X_train,y_train)
for_clf.score(X_test,y_test)
for_clf_name = 'for_clf.sav'
pickle.dump(for_clf, open(for_clf_name, 'wb'))
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
neigh = RadiusNeighborsClassifier(radius=1.0)
neigh.fit(X_train,y_train)
neigh_name = 'neigh.sav'
pickle.dump(neigh, open(neigh_name, 'wb'))
gra_clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.01,
                                     max_depth=1, random_state=0)
gra_clf.fit(X_train, y_train)
gra_clf_name = 'gra_clf.sav'
pickle.dump(gra_clf, open(gra_clf_name, 'wb'))
sgd_clf = SGDClassifier(max_iter=1000, tol=1e-3)
sgd_clf.fit(X_train, y_train)
sgd_clf_name = 'sgd_clf.sav'
pickle.dump(sgd_clf, open(sgd_clf_name, 'wb'))
MLP_clf = MLPClassifier(random_state=1, max_iter=300)
MLP_clf.fit(X_train, y_train)
MLP_clf_name = 'MLP_clf.sav'
pickle.dump(MLP_clf, open(MLP_clf_name, 'wb'))
MLP_clf.score(X_train, y_train)
MLP_clf.score(X_test,y_test)
gra_clf.score(X_train, y_train)
gra_clf.score(X_test,y_test)
sgd_clf.score(X_train, y_train)
sgd_clf.score(X_test,y_test)
k_neigh = KNeighborsClassifier(n_neighbors=3)
k_neigh.fit(X_train,y_train)
k_neigh_name = 'k_neigh.sav'
pickle.dump(k_neigh, open(k_neigh_name, 'wb'))
k_neigh.score(X_train,y_train)
k_neigh.score(X_test,y_test)
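With everything fitted, a short loop puts the train/test accuracies side by side. A minimal sketch (the dict labels are illustrative):
# Compare train/test accuracy across the fitted models
models = {'SVC': classifier, 'LogReg': clf, 'Tree': tree_clf, 'Forest': for_clf,
          'GradBoost': gra_clf, 'SGD': sgd_clf, 'MLP': MLP_clf, 'KNN': k_neigh}
for label, model in models.items():
    print(f"{label:>10}: train={model.score(X_train, y_train):.3f}  "
          f"test={model.score(X_test, y_test):.3f}")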
X_test.isnull().sum()
# Build the submission by predicting all test rows at once; per-row assignment via
# .iloc on a column is slow and triggers pandas chained-assignment warnings
classifier_Test = pd.DataFrame({'ID': ID, 'Offset_fault': classifier.predict(Test_X)})
classifier_Test.to_csv('classifier.csv', index=False)
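The same two-line pattern repeats for each model below; a small helper would factor it out. A minimal sketch (make_submission is a hypothetical name):
def make_submission(model, filename):
    # Predict every test row at once and write a submission file
    sub = pd.DataFrame({'ID': ID, 'Offset_fault': model.predict(Test_X)})
    sub.to_csv(filename, index=False)
Each block below would then reduce to, e.g., make_submission(tree_clf, 'tree_clf.csv').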
clf_Test = pd.DataFrame({'ID': ID, 'Offset_fault': clf.predict(Test_X)})
clf_Test.to_csv('clf.csv', index=False)
tree_clf_Test = pd.DataFrame({'ID': ID, 'Offset_fault': tree_clf.predict(Test_X)})
tree_clf_Test.to_csv('tree_clf.csv', index=False)
for_clf_Test = pd.DataFrame({'ID': ID, 'Offset_fault': for_clf.predict(Test_X)})
for_clf_Test.to_csv('for_clf.csv', index=False)
MLP_clf_Test = pd.DataFrame({'ID': ID, 'Offset_fault': MLP_clf.predict(Test_X)})
MLP_clf_Test.to_csv('MLP_clf.csv', index=False)
sgd_clf_Test = pd.DataFrame({'ID': ID, 'Offset_fault': sgd_clf.predict(Test_X)})
sgd_clf_Test.to_csv('sgd_clf.csv', index=False)
k_neigh_Test = pd.DataFrame({'ID': ID, 'Offset_fault': k_neigh.predict(Test_X)})
k_neigh_Test.to_csv('k_neigh.csv', index=False)
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
# 5-fold cross-validated accuracy for XGBoost on the training split
cross_val_score(XGBClassifier(), X_train, y_train)
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
xgb.score(X_train, y_train)
xgb.score(X_test, y_test)
# Sanity check: one row each from the train, validation, and test features,
# to confirm they are on comparable scales
X_train.iloc[0]
X_test.iloc[0]
Test_X[0]
from sklearn.ensemble import BaggingClassifier
# Bagging ensemble whose base estimator is itself a 100-tree random forest
for_bag_clf = BaggingClassifier(RandomForestClassifier(n_estimators=100), n_estimators=100)
for_bag_clf.fit(X_train, y_train)
for_bag_clf.score(X_train, y_train)
for_bag_clf.score(X_test, y_test)
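A natural last step is to combine the strongest model families with a voting ensemble. A minimal sketch using scikit-learn's VotingClassifier (the estimator labels are illustrative; hard voting assumed):
from sklearn.ensemble import VotingClassifier

voting_clf = VotingClassifier(estimators=[
    ('svc', SVC(kernel='rbf', random_state=0)),
    ('forest', RandomForestClassifier()),
    ('knn', KNeighborsClassifier(n_neighbors=3)),
], voting='hard')
voting_clf.fit(X_train, y_train)
voting_clf.score(X_test, y_test)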