import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the training data and take a first look at the leading rows.
df = pd.read_csv("train.csv") # importing the dataset
df.head() # checking the head
URLobject
URL_LENGTHint64
0
M0_109
16
1
B0_2314
16
2
B0_911
16
3
B0_113
17
4
B0_403
17
df.info() # checking the info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1781 entries, 0 to 1780
Data columns (total 21 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 URL 1781 non-null object
1 URL_LENGTH 1781 non-null int64
2 NUMBER_SPECIAL_CHARACTERS 1781 non-null int64
3 CHARSET 1781 non-null object
4 SERVER 1780 non-null object
5 CONTENT_LENGTH 969 non-null float64
6 WHOIS_COUNTRY 1781 non-null object
7 WHOIS_STATEPRO 1781 non-null object
8 WHOIS_REGDATE 1781 non-null object
9 WHOIS_UPDATED_DATE 1781 non-null object
10 TCP_CONVERSATION_EXCHANGE 1781 non-null int64
11 DIST_REMOTE_TCP_PORT 1781 non-null int64
12 REMOTE_IPS 1781 non-null int64
13 APP_BYTES 1781 non-null int64
14 SOURCE_APP_PACKETS 1781 non-null int64
15 REMOTE_APP_PACKETS 1781 non-null int64
16 SOURCE_APP_BYTES 1781 non-null int64
17 REMOTE_APP_BYTES 1781 non-null int64
18 APP_PACKETS 1781 non-null int64
19 DNS_QUERY_TIMES 1780 non-null float64
20 Type 1781 non-null int64
dtypes: float64(2), int64(12), object(7)
memory usage: 292.3+ KB
df.isna().sum() # count of missing values per column
# Class balance: percentage of each Type label (the target) in the dataset.
100 * df['Type'].value_counts()/len(df)
df.describe() # descriptive statistics for each numerical feature
URL_LENGTHfloat64
NUMBER_SPECIAL_CHARACTERSfloat64
count
1781
1781
mean
56.96125772
11.11173498
std
27.55558557
4.549895958
min
16
5
25%
39
8
50%
49
10
75%
68
13
max
249
43
# Boxplots: how the special-character count and the URL length distribute
# across the two Type classes.
plt.figure(figsize=(12, 6))
sns.boxplot(data=df, x='Type', y='NUMBER_SPECIAL_CHARACTERS');
plt.figure(figsize=(12, 6))
sns.boxplot(data=df, x='Type', y='URL_LENGTH');
# Cardinality of every object-dtype (categorical) column.
for col in df.select_dtypes(include='object').columns:
    print(f"{col} -> {df[col].nunique()}")
URL -> 1781
CHARSET -> 9
SERVER -> 239
WHOIS_COUNTRY -> 49
WHOIS_STATEPRO -> 182
WHOIS_REGDATE -> 891
WHOIS_UPDATED_DATE -> 594
df['CHARSET'].value_counts()
def CHARSET_CLEANER(x):
    """Collapse rare charset labels into a single "OTHERS" bucket.

    The five values kept are the frequent ones observed in
    df['CHARSET'].value_counts(); everything else is grouped together so
    the later one-hot encoding stays small.
    """
    frequent = {'UTF-8', 'ISO-8859-1', 'utf-8', 'us-ascii', 'iso-8859-1'}
    return x if x in frequent else "OTHERS"
# Collapse the rare charset labels, then re-check the distribution.
df['CHARSET'] = df['CHARSET'].apply(CHARSET_CLEANER)
df['CHARSET'].value_counts()
df['SERVER'].value_counts()
def SERVER_CLEANER(x):
    """Keep only the most common SERVER labels; bucket the rest as "OTHERS".

    The retained names are the high-frequency entries seen in
    df['SERVER'].value_counts(); collapsing the long tail of 239 distinct
    values keeps the later one-hot encoding manageable.
    """
    frequent = {'Apache', 'nginx', 'None', 'Microsoft-HTTPAPI/2.0', 'cloudflare-nginx'}
    return x if x in frequent else "OTHERS"
# Collapse the rare SERVER labels, then inspect the remaining distribution.
df['SERVER'] = df['SERVER'].apply(SERVER_CLEANER)
df['SERVER'].value_counts()
df['WHOIS_STATEPRO'].value_counts()[:7] # the 7 most frequent state/province labels
def STATE_CLEANER(x):
    """Keep the six most frequent WHOIS state/province labels.

    Any label outside the retained set (taken from the top of
    df['WHOIS_STATEPRO'].value_counts()) is mapped to "OTHERS" to shrink
    the 182 distinct raw values before one-hot encoding.
    """
    frequent = {'CA', 'None', 'NY', 'WA', 'Barcelona', 'FL'}
    return x if x in frequent else "OTHERS"
# Collapse rare state/province labels into "OTHERS", then re-check counts.
df['WHOIS_STATEPRO'] = df['WHOIS_STATEPRO'].apply(STATE_CLEANER)
df['WHOIS_STATEPRO'].value_counts()
def DATE_CLEANER(x):
    """Reduce a WHOIS date string to a binary Present/Absent flag.

    Returns "Absent" for the literal string 'None' and "Present" for
    anything else — only the existence of a date is kept, not its value.
    """
    return "Absent" if x == 'None' else "Present"
# Reduce both WHOIS date columns to Present/Absent flags and re-inspect.
df['WHOIS_REGDATE'] = df['WHOIS_REGDATE'].apply(DATE_CLEANER)
df['WHOIS_UPDATED_DATE'] = df['WHOIS_UPDATED_DATE'].apply(DATE_CLEANER)
df.head()
URLobject
URL_LENGTHint64
0
M0_109
16
1
B0_2314
16
2
B0_911
16
3
B0_113
17
4
B0_403
17
# URL is a unique identifier (1781 distinct values for 1781 rows) so it
# carries no generalizable signal; WHOIS_COUNTRY is dropped here as well.
df.drop(['URL','WHOIS_COUNTRY'],axis=1,inplace=True)
df.head()
URL_LENGTHint64
NUMBER_SPECIAL_CHARACTERSint64
0
16
7
1
16
6
2
16
6
3
17
6
4
17
6
# Correlation heatmap of the numeric features.
plt.figure(figsize=(20,10))
sns.heatmap(data=df.corr(),cmap='plasma',annot=True)
df2 = df.copy() # work on a copy so df stays intact
df2.drop("CONTENT_LENGTH",axis=1,inplace=True) # CONTENT_LENGTH is mostly missing (969/1781 non-null), so it is discarded
df3 = df2.copy() # creating a copy of the dataframe
df3 = pd.get_dummies(df3,columns=['WHOIS_UPDATED_DATE','WHOIS_REGDATE','WHOIS_STATEPRO','SERVER','CHARSET'],drop_first=True) # one-hot encode the cleaned categoricals; drop_first avoids redundant dummy columns
df3.head() # checking the head
URL_LENGTHint64
NUMBER_SPECIAL_CHARACTERSint64
0
16
7
1
16
6
2
16
6
3
17
6
4
17
6
df3.isna().sum() # checking for any remaining missing value
df3.dropna(inplace=True) # drop the rows that still contain missing values
!pip install imblearn
Requirement already satisfied: imblearn in /root/venv/lib/python3.7/site-packages (0.0)
Requirement already satisfied: imbalanced-learn in /root/venv/lib/python3.7/site-packages (from imblearn) (0.9.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from imbalanced-learn->imblearn) (3.1.0)
Requirement already satisfied: numpy>=1.14.6 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from imbalanced-learn->imblearn) (1.19.5)
Requirement already satisfied: joblib>=0.11 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from imbalanced-learn->imblearn) (1.1.0)
Requirement already satisfied: scipy>=1.1.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from imbalanced-learn->imblearn) (1.7.3)
Requirement already satisfied: scikit-learn>=1.0.1 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from imbalanced-learn->imblearn) (1.0.2)
WARNING: You are using pip version 20.1.1; however, version 22.0.3 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
# Importing the SMOTE oversampling function
from imblearn.over_sampling import SMOTE
# Creating the set of independent features and target variable
X = df3.drop("Type",axis=1)
y = df3['Type']
from imblearn.under_sampling import RandomUnderSampler # importing the Under Sampling function
# For both samplers, sampling_strategy is the desired minority/majority ratio
# after resampling (per the imbalanced-learn docs). 0.5 here tells the
# undersampler to shrink the majority class to twice the minority count,
# keeping the data imbalanced in the same direction as the original frame,
# just less extremely.
undersample = RandomUnderSampler(sampling_strategy=0.5)
from imblearn.pipeline import Pipeline # Importing the pipeline
# NOTE(review): with sampling_strategy=0.5 SMOTE oversamples the minority to
# *half* the majority count, not to equality as the original comment claimed
# — confirm the intended ratios.
oversample = SMOTE(sampling_strategy=0.5)
steps = [('o',oversample),('u',undersample)] # pipeline order: first oversample the minority class, then undersample the majority class
pipeline = Pipeline(steps=steps) # Creating the pipeline instance
X_smote, y_smote = pipeline.fit_resample(X,y) # Fitting the pipeline to our dataset
y_smote.value_counts() # value counts of the target feature after resampling
len(X_smote) # checking the total number of samples we have
X_smote.shape # checking the shape
from sklearn.model_selection import train_test_split,StratifiedKFold,cross_val_score,cross_validate # Importing the required functions
# NOTE(review): the split below uses the ORIGINAL X/y, so the SMOTE-resampled
# X_smote/y_smote produced above are never used for model training — confirm
# whether that is intentional. (Best practice would in any case be to
# resample only the training fold, after the split, to avoid leakage.)
# Also note no stratify= is passed despite the class imbalance.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # creating a test holdout set
from sklearn.preprocessing import StandardScaler # import the standard scaling function
sc = StandardScaler() # creating an instance of the scaling function
X_train = sc.fit_transform(X_train) # fitting and transform the training set
X_test = sc.transform(X_test) # just transforming the testing set to avoid 'data leakage'
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,f1_score,fbeta_score,make_scorer,precision_score,recall_score
# importing all the metric scores required for evaluation
# Scorer dictionary handed to cross_validate for stratified k-fold CV.
scoring = {'accuracy' : make_scorer(accuracy_score),
'precision' : make_scorer(precision_score),
'recall' : make_scorer(recall_score),
'f1_score' : make_scorer(f1_score)}
!pip install sklearn
Requirement already satisfied: sklearn in /root/venv/lib/python3.7/site-packages (0.0)
Requirement already satisfied: scikit-learn in /shared-libs/python3.7/py/lib/python3.7/site-packages (from sklearn) (1.0.2)
Requirement already satisfied: numpy>=1.14.6 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from scikit-learn->sklearn) (1.19.5)
Requirement already satisfied: scipy>=1.1.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from scikit-learn->sklearn) (1.7.3)
Requirement already satisfied: joblib>=0.11 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from scikit-learn->sklearn) (1.1.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from scikit-learn->sklearn) (3.1.0)
WARNING: You are using pip version 20.1.1; however, version 22.0.3 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
from sklearn.ensemble import RandomForestClassifier # importing the function
# class_weight={0:1,1:5} penalises errors on class 1 (the minority class,
# 52 of 356 in the holdout) five times as much; max_depth=5 caps tree depth.
rf = RandomForestClassifier(n_estimators=100,criterion='entropy',random_state=42,class_weight={0:1,1:5},max_depth=5) # creating an instance
rf.fit(X_train,y_train) # fitting the model
# NOTE(review): this cross-validation runs on the held-out TEST split only,
# not on the training data — confirm that is intended.
rf_cv_f1 = cross_validate(rf,X_test,y_test,cv=StratifiedKFold(n_splits=5,shuffle=True,random_state=111),scoring=scoring)
# cross-validating over 5 folds; evaluated metrics: accuracy, precision, recall and F-1 score
print(f" ACCURACY: {rf_cv_f1['test_accuracy'].mean()}")
print(f" PRECISION: {rf_cv_f1['test_precision'].mean()}")
print(f" RECALL: {rf_cv_f1['test_recall'].mean()}")
print(f" F-1 Score: {rf_cv_f1['test_f1_score'].mean()}")
ACCURACY: 0.8958528951486698
PRECISION: 0.6437762237762238
RECALL: 0.7872727272727272
F-1 Score: 0.695487012987013
rf_pred = rf.predict(X_test) # predicting on the hold out test set
print(classification_report(y_test,rf_pred)) # per-class precision/recall/F1
print(confusion_matrix(y_test,rf_pred)) # rows = actual class, columns = predicted class
precision recall f1-score support
0 0.98 0.93 0.95 304
1 0.69 0.87 0.77 52
accuracy 0.92 356
macro avg 0.83 0.90 0.86 356
weighted avg 0.93 0.92 0.93 356
[[284 20]
[ 7 45]]
!pip install catboost
Requirement already satisfied: catboost in /root/venv/lib/python3.7/site-packages (1.0.4)
Requirement already satisfied: matplotlib in /shared-libs/python3.7/py/lib/python3.7/site-packages (from catboost) (3.5.1)
Requirement already satisfied: plotly in /shared-libs/python3.7/py/lib/python3.7/site-packages (from catboost) (5.6.0)
Requirement already satisfied: numpy>=1.16.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from catboost) (1.19.5)
Requirement already satisfied: six in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from catboost) (1.16.0)
Requirement already satisfied: scipy in /shared-libs/python3.7/py/lib/python3.7/site-packages (from catboost) (1.7.3)
Requirement already satisfied: graphviz in /root/venv/lib/python3.7/site-packages (from catboost) (0.19.1)
Requirement already satisfied: pandas>=0.24.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from catboost) (1.2.5)
Requirement already satisfied: cycler>=0.10 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from matplotlib->catboost) (0.11.0)
Requirement already satisfied: pyparsing>=2.2.1 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from matplotlib->catboost) (3.0.7)
Requirement already satisfied: pillow>=6.2.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from matplotlib->catboost) (9.0.1)
Requirement already satisfied: packaging>=20.0 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from matplotlib->catboost) (21.3)
Requirement already satisfied: python-dateutil>=2.7 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from matplotlib->catboost) (2.8.2)
Requirement already satisfied: kiwisolver>=1.0.1 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from matplotlib->catboost) (1.3.2)
Requirement already satisfied: fonttools>=4.22.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from matplotlib->catboost) (4.29.1)
Requirement already satisfied: tenacity>=6.2.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from plotly->catboost) (8.0.1)
Requirement already satisfied: pytz>=2017.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pandas>=0.24.0->catboost) (2021.3)
WARNING: You are using pip version 20.1.1; however, version 22.0.3 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
from catboost import CatBoostClassifier # importing the function
# Same 1:5 class weighting and depth cap as the random forest above;
# verbose=500 logs every 500th iteration, early_stopping_rounds=30 halts
# when the loss stops improving for 30 rounds.
cb = CatBoostClassifier(random_state=42,verbose=500,class_weights={0:1,1:5},max_depth=5,early_stopping_rounds=30,boosting_type='Ordered') # creating an instance
cb.fit(X_train,y_train) # fitting the model
Learning rate set to 0.011981
0: learn: 0.6756274 total: 87.5ms remaining: 1m 27s
500: learn: 0.1157566 total: 8.33s remaining: 8.29s
999: learn: 0.0604226 total: 16.5s remaining: 0us
# NOTE(review): as with the random forest, CV here runs on the test split only.
cb_cv_f1 = cross_validate(cb,X_test,y_test,cv=StratifiedKFold(n_splits=5,shuffle=True,random_state=42),scoring=scoring)
# cross-validating over 5 folds; evaluated metrics: accuracy, precision, recall and F-1 score
Learning rate set to 0.006019
0: learn: 0.6857393 total: 16.8ms remaining: 16.8s
500: learn: 0.1806446 total: 6.67s remaining: 6.64s
999: learn: 0.1018346 total: 13s remaining: 0us
Learning rate set to 0.006028
0: learn: 0.6866775 total: 15.1ms remaining: 15.1s
500: learn: 0.1822175 total: 7.21s remaining: 7.18s
999: learn: 0.1087319 total: 14.1s remaining: 0us
Learning rate set to 0.006028
0: learn: 0.6867839 total: 15.7ms remaining: 15.7s
500: learn: 0.1680272 total: 7.19s remaining: 7.16s
999: learn: 0.0988092 total: 14s remaining: 0us
Learning rate set to 0.006028
0: learn: 0.6867894 total: 27.6ms remaining: 27.6s
500: learn: 0.1909167 total: 7.13s remaining: 7.1s
999: learn: 0.1079882 total: 14.1s remaining: 0us
Learning rate set to 0.006028
0: learn: 0.6872671 total: 18.6ms remaining: 18.5s
500: learn: 0.1850676 total: 6.91s remaining: 6.88s
999: learn: 0.1092527 total: 14.1s remaining: 0us
# Mean cross-validation metrics for the CatBoost model.
print(f" ACCURACY: {cb_cv_f1['test_accuracy'].mean()}")
print(f" PRECISION: {cb_cv_f1['test_precision'].mean()}")
print(f" RECALL: {cb_cv_f1['test_recall'].mean()}")
print(f" F-1 Score: {cb_cv_f1['test_f1_score'].mean()}")
ACCURACY: 0.901643192488263
PRECISION: 0.636451969083548
RECALL: 0.8272727272727272
F-1 Score: 0.7126429512516469
cb_pred = cb.predict(X_test) # predicting on the hold out test set
print(classification_report(y_test,cb_pred)) # per-class precision/recall/F1
print(confusion_matrix(y_test,cb_pred)) # rows = actual class, columns = predicted class
print(precision_score(y_test,cb_pred)) # precision for the positive class (Type=1)
print(recall_score(y_test,cb_pred)) # recall for the positive class
print(f1_score(y_test,cb_pred)) # F1 for the positive class
precision recall f1-score support
0 0.98 0.96 0.97 304
1 0.79 0.87 0.83 52
accuracy 0.95 356
macro avg 0.88 0.91 0.90 356
weighted avg 0.95 0.95 0.95 356
[[292 12]
[ 7 45]]
0.7894736842105263
0.8653846153846154
0.8256880733944955
def correlation(dataset, threshold):
    """Return the columns that are highly collinear with an earlier column.

    A column is flagged when the absolute Pearson correlation between it and
    any column preceding it in ``dataset`` exceeds ``threshold``.  Only the
    later column of each correlated pair is returned, so dropping the result
    keeps one representative of every correlated group.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Numeric dataframe whose pairwise correlations are inspected.
    threshold : float
        Absolute-correlation cutoff, e.g. 0.7 for 70% collinearity.

    Returns
    -------
    set of str
        Names of columns exceeding the cutoff against an earlier column.
    """
    col_corr = set()  # a set avoids adding the same column twice
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):  # j < i: lower triangle only, diagonal excluded
            # BUG FIX: the original compared the *signed* correlation, so
            # strongly negatively correlated columns were silently missed,
            # despite the comment claiming abs() was taken. abs() is now
            # actually applied.
            if abs(corr_matrix.iloc[i, j]) > threshold:
                colname = corr_matrix.columns[i]  # name of the later column
                col_corr.add(colname)
    return col_corr
correlation(X_smote,0.7) # all those columns which have more than 70% collinearity
X_smote2 = X_smote.drop(list(correlation(X_smote,0.7)),axis=1) # removing all those columns which have more than 70% collinearity
from sklearn.feature_selection import mutual_info_classif
# Mutual information of each remaining feature with the target variable.
mutual_info = mutual_info_classif(X_smote2, y_smote)
mutual_info
mutual_info = pd.Series(mutual_info) # wrap in a Series so scores can carry feature labels
mutual_info.index = X_smote2.columns # label each score with its feature name
mutual_info.sort_values(ascending=False) # highest information gain first
# Bar plot of mutual information gain with respect to the target variable.
# BUG FIX: the labels/title were previously set BEFORE plot.bar(); since
# pandas' plot.bar(figsize=...) opens a new figure, they ended up on the
# previous (wrong) figure. Draw the plot first, then annotate its axes.
mutual_info.sort_values(ascending=False).plot.bar(figsize=(20, 10))
plt.ylabel("Mutual Information Gain")
plt.xlabel("Independent Features")
plt.title("Mutual Information Gain of each feature with respect to target variable")