import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the training data and take a first look at the leading rows.
df = pd.read_csv("train.csv") # importing the dataset
df.head() # checking the head
URLobject
URL_LENGTHint64
0
M0_109
16
1
B0_2314
16
2
B0_911
16
3
B0_113
17
4
B0_403
17
df.info() # checking the info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1781 entries, 0 to 1780
Data columns (total 21 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 URL 1781 non-null object
1 URL_LENGTH 1781 non-null int64
2 NUMBER_SPECIAL_CHARACTERS 1781 non-null int64
3 CHARSET 1781 non-null object
4 SERVER 1780 non-null object
5 CONTENT_LENGTH 969 non-null float64
6 WHOIS_COUNTRY 1781 non-null object
7 WHOIS_STATEPRO 1781 non-null object
8 WHOIS_REGDATE 1781 non-null object
9 WHOIS_UPDATED_DATE 1781 non-null object
10 TCP_CONVERSATION_EXCHANGE 1781 non-null int64
11 DIST_REMOTE_TCP_PORT 1781 non-null int64
12 REMOTE_IPS 1781 non-null int64
13 APP_BYTES 1781 non-null int64
14 SOURCE_APP_PACKETS 1781 non-null int64
15 REMOTE_APP_PACKETS 1781 non-null int64
16 SOURCE_APP_BYTES 1781 non-null int64
17 REMOTE_APP_BYTES 1781 non-null int64
18 APP_PACKETS 1781 non-null int64
19 DNS_QUERY_TIMES 1780 non-null float64
20 Type 1781 non-null int64
dtypes: float64(2), int64(12), object(7)
memory usage: 292.3+ KB
df.isna().sum() # count of missing values per column
# Class balance: percentage of each Type label (the target) in the dataset.
100 * df['Type'].value_counts()/len(df)
df.describe() # descriptive statistics for each numerical feature
URL_LENGTHfloat64
NUMBER_SPECIAL_CHARACTERSfloat64
count
1781
1781
mean
56.96125772
11.11173498
std
27.55558557
4.549895958
min
16
5
25%
39
8
50%
49
10
75%
68
13
max
249
43
# Boxplots: how the special-character count and the URL length distribute
# across the two Type classes.
plt.figure(figsize=(12, 6))
sns.boxplot(data=df, x='Type', y='NUMBER_SPECIAL_CHARACTERS');
plt.figure(figsize=(12, 6))
sns.boxplot(data=df, x='Type', y='URL_LENGTH');
# Cardinality of every object-dtype (categorical) column.
for col in df.select_dtypes(include='object').columns:
    print(f"{col} -> {df[col].nunique()}")
URL -> 1781
CHARSET -> 9
SERVER -> 239
WHOIS_COUNTRY -> 49
WHOIS_STATEPRO -> 182
WHOIS_REGDATE -> 891
WHOIS_UPDATED_DATE -> 594
df['CHARSET'].value_counts()
def CHARSET_CLEANER(x):
    """Collapse rare charset labels into a single "OTHERS" bucket.

    The five values kept are the frequent ones observed in
    df['CHARSET'].value_counts(); everything else is grouped together so
    the later one-hot encoding stays small.
    """
    frequent = {'UTF-8', 'ISO-8859-1', 'utf-8', 'us-ascii', 'iso-8859-1'}
    return x if x in frequent else "OTHERS"
# Collapse the rare charset labels, then re-check the distribution.
df['CHARSET'] = df['CHARSET'].apply(CHARSET_CLEANER)
df['CHARSET'].value_counts()
df['SERVER'].value_counts()
def SERVER_CLEANER(x):
    """Keep only the most common SERVER labels; bucket the rest as "OTHERS".

    The retained names are the high-frequency entries seen in
    df['SERVER'].value_counts(); collapsing the long tail of 239 distinct
    values keeps the later one-hot encoding manageable.
    """
    frequent = {'Apache', 'nginx', 'None', 'Microsoft-HTTPAPI/2.0', 'cloudflare-nginx'}
    return x if x in frequent else "OTHERS"
# Collapse the rare SERVER labels, then inspect the remaining distribution.
df['SERVER'] = df['SERVER'].apply(SERVER_CLEANER)
df['SERVER'].value_counts()
df['WHOIS_STATEPRO'].value_counts()[:7] # the 7 most frequent state/province labels
def STATE_CLEANER(x):
    """Keep the six most frequent WHOIS state/province labels.

    Any label outside the retained set (taken from the top of
    df['WHOIS_STATEPRO'].value_counts()) is mapped to "OTHERS" to shrink
    the 182 distinct raw values before one-hot encoding.
    """
    frequent = {'CA', 'None', 'NY', 'WA', 'Barcelona', 'FL'}
    return x if x in frequent else "OTHERS"
# Collapse rare state/province labels into "OTHERS", then re-check counts.
df['WHOIS_STATEPRO'] = df['WHOIS_STATEPRO'].apply(STATE_CLEANER)
df['WHOIS_STATEPRO'].value_counts()
def DATE_CLEANER(x):
    """Reduce a WHOIS date string to a binary Present/Absent flag.

    Returns "Absent" for the literal string 'None' and "Present" for
    anything else — only the existence of a date is kept, not its value.
    """
    return "Absent" if x == 'None' else "Present"
# Reduce both WHOIS date columns to Present/Absent flags and re-inspect.
df['WHOIS_REGDATE'] = df['WHOIS_REGDATE'].apply(DATE_CLEANER)
df['WHOIS_UPDATED_DATE'] = df['WHOIS_UPDATED_DATE'].apply(DATE_CLEANER)
df.head()
URLobject
URL_LENGTHint64
0
M0_109
16
1
B0_2314
16
2
B0_911
16
3
B0_113
17
4
B0_403
17
# URL is a unique identifier (1781 distinct values for 1781 rows) so it
# carries no generalizable signal; WHOIS_COUNTRY is dropped here as well.
df.drop(['URL','WHOIS_COUNTRY'],axis=1,inplace=True)
df.head()
URL_LENGTHint64
NUMBER_SPECIAL_CHARACTERSint64
0
16
7
1
16
6
2
16
6
3
17
6
4
17
6
# Correlation heatmap of the numeric features.
plt.figure(figsize=(20,10))
sns.heatmap(data=df.corr(),cmap='plasma',annot=True)
df2 = df.copy() # work on a copy so df stays intact
df2.drop("CONTENT_LENGTH",axis=1,inplace=True) # CONTENT_LENGTH is mostly missing (969/1781 non-null), so it is discarded
df3 = df2.copy() # creating a copy of the dataframe
df3 = pd.get_dummies(df3,columns=['WHOIS_UPDATED_DATE','WHOIS_REGDATE','WHOIS_STATEPRO','SERVER','CHARSET'],drop_first=True) # one-hot encode the cleaned categoricals; drop_first avoids redundant dummy columns
df3.head() # checking the head
URL_LENGTHint64
NUMBER_SPECIAL_CHARACTERSint64
0
16
7
1
16
6
2
16
6
3
17
6
4
17
6
df3.isna().sum() # checking for any remaining missing value
df3.dropna(inplace=True) # drop the rows that still contain missing values
!pip install imblearn
Requirement already satisfied: imblearn in /root/venv/lib/python3.7/site-packages (0.0)
Requirement already satisfied: imbalanced-learn in /root/venv/lib/python3.7/site-packages (from imblearn) (0.9.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from imbalanced-learn->imblearn) (3.1.0)
Requirement already satisfied: numpy>=1.14.6 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from imbalanced-learn->imblearn) (1.19.5)
Requirement already satisfied: joblib>=0.11 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from imbalanced-learn->imblearn) (1.1.0)
Requirement already satisfied: scipy>=1.1.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from imbalanced-learn->imblearn) (1.7.3)
Requirement already satisfied: scikit-learn>=1.0.1 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from imbalanced-learn->imblearn) (1.0.2)
WARNING: You are using pip version 20.1.1; however, version 22.0.3 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
# Importing the SMOTE oversampling function
from imblearn.over_sampling import SMOTE
# Creating the set of independent features and target variable
X = df3.drop("Type",axis=1)
y = df3['Type']
from imblearn.under_sampling import RandomUnderSampler # importing the Under Sampling function
# For both samplers, sampling_strategy is the desired minority/majority ratio
# after resampling (per the imbalanced-learn docs). 0.5 here tells the
# undersampler to shrink the majority class to twice the minority count,
# keeping the data imbalanced in the same direction as the original frame,
# just less extremely.
undersample = RandomUnderSampler(sampling_strategy=0.5)
from imblearn.pipeline import Pipeline # Importing the pipeline
# NOTE(review): with sampling_strategy=0.5 SMOTE oversamples the minority to
# *half* the majority count, not to equality as the original comment claimed
# — confirm the intended ratios.
oversample = SMOTE(sampling_strategy=0.5)
steps = [('o',oversample),('u',undersample)] # pipeline order: first oversample the minority class, then undersample the majority class
pipeline = Pipeline(steps=steps) # Creating the pipeline instance
X_smote, y_smote = pipeline.fit_resample(X,y) # Fitting the pipeline to our dataset
y_smote.value_counts() # value counts of the target feature after resampling
len(X_smote) # checking the total number of samples we have
X_smote.shape # checking the shape
from sklearn.model_selection import train_test_split,StratifiedKFold,cross_val_score,cross_validate # Importing the required functions
# NOTE(review): the split below uses the ORIGINAL X/y, so the SMOTE-resampled
# X_smote/y_smote produced above are never used for model training — confirm
# whether that is intentional. (Best practice would in any case be to
# resample only the training fold, after the split, to avoid leakage.)
# Also note no stratify= is passed despite the class imbalance.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # creating a test holdout set
from sklearn.preprocessing import StandardScaler # import the standard scaling function
sc = StandardScaler() # creating an instance of the scaling function
X_train = sc.fit_transform(X_train) # fitting and transform the training set
X_test = sc.transform(X_test) # just transforming the testing set to avoid 'data leakage'
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,f1_score,fbeta_score,make_scorer,precision_score,recall_score
# importing all the metric scores required for evaluation
# Scorer dictionary handed to cross_validate for stratified k-fold CV.
scoring = {'accuracy' : make_scorer(accuracy_score),
'precision' : make_scorer(precision_score),
'recall' : make_scorer(recall_score),
'f1_score' : make_scorer(f1_score)}
!pip install sklearn
Requirement already satisfied: sklearn in /root/venv/lib/python3.7/site-packages (0.0)
Requirement already satisfied: scikit-learn in /shared-libs/python3.7/py/lib/python3.7/site-packages (from sklearn) (1.0.2)
Requirement already satisfied: numpy>=1.14.6 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from scikit-learn->sklearn) (1.19.5)
Requirement already satisfied: scipy>=1.1.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from scikit-learn->sklearn) (1.7.3)
Requirement already satisfied: joblib>=0.11 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from scikit-learn->sklearn) (1.1.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from scikit-learn->sklearn) (3.1.0)
WARNING: You are using pip version 20.1.1; however, version 22.0.3 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
from sklearn.ensemble import RandomForestClassifier # importing the function
# class_weight={0:1,1:5} penalises errors on class 1 (the minority class,
# 52 of 356 in the holdout) five times as much; max_depth=5 caps tree depth.
rf = RandomForestClassifier(n_estimators=100,criterion='entropy',random_state=42,class_weight={0:1,1:5},max_depth=5) # creating an instance
rf.fit(X_train,y_train) # fitting the model
# NOTE(review): this cross-validation runs on the held-out TEST split only,
# not on the training data — confirm that is intended.
rf_cv_f1 = cross_validate(rf,X_test,y_test,cv=StratifiedKFold(n_splits=5,shuffle=True,random_state=111),scoring=scoring)
# cross-validating over 5 folds; evaluated metrics: accuracy, precision, recall and F-1 score
print(f" ACCURACY: {rf_cv_f1['test_accuracy'].mean()}")
print(f" PRECISION: {rf_cv_f1['test_precision'].mean()}")
print(f" RECALL: {rf_cv_f1['test_recall'].mean()}")
print(f" F-1 Score: {rf_cv_f1['test_f1_score'].mean()}")
ACCURACY: 0.8958528951486698
PRECISION: 0.6437762237762238
RECALL: 0.7872727272727272
F-1 Score: 0.695487012987013
rf_pred = rf.predict(X_test) # predicting on the hold out test set
print(classification_report(y_test,rf_pred)) # per-class precision/recall/F1
print(confusion_matrix(y_test,rf_pred)) # rows = actual class, columns = predicted class
precision recall f1-score support
0 0.98 0.93 0.95 304
1 0.69 0.87 0.77 52
accuracy 0.92 356
macro avg 0.83 0.90 0.86 356
weighted avg 0.93 0.92 0.93 356
[[284 20]
[ 7 45]]
!pip install catboost
Requirement already satisfied: catboost in /root/venv/lib/python3.7/site-packages (1.0.4)
Requirement already satisfied: matplotlib in /shared-libs/python3.7/py/lib/python3.7/site-packages (from catboost) (3.5.1)
Requirement already satisfied: plotly in /shared-libs/python3.7/py/lib/python3.7/site-packages (from catboost) (5.6.0)
Requirement already satisfied: numpy>=1.16.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from catboost) (1.19.5)
Requirement already satisfied: six in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from catboost) (1.16.0)
Requirement already satisfied: scipy in /shared-libs/python3.7/py/lib/python3.7/site-packages (from catboost) (1.7.3)
Requirement already satisfied: graphviz in /root/venv/lib/python3.7/site-packages (from catboost) (0.19.1)
Requirement already satisfied: pandas>=0.24.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from catboost) (1.2.5)
Requirement already satisfied: cycler>=0.10 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from matplotlib->catboost) (0.11.0)
Requirement already satisfied: pyparsing>=2.2.1 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from matplotlib->catboost) (3.0.7)
Requirement already satisfied: pillow>=6.2.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from matplotlib->catboost) (9.0.1)
Requirement already satisfied: packaging>=20.0 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from matplotlib->catboost) (21.3)
Requirement already satisfied: python-dateutil>=2.7 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from matplotlib->catboost) (2.8.2)
Requirement already satisfied: kiwisolver>=1.0.1 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from matplotlib->catboost) (1.3.2)
Requirement already satisfied: fonttools>=4.22.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from matplotlib->catboost) (4.29.1)
Requirement already satisfied: tenacity>=6.2.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from plotly->catboost) (8.0.1)
Requirement already satisfied: pytz>=2017.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pandas>=0.24.0->catboost) (2021.3)
WARNING: You are using pip version 20.1.1; however, version 22.0.3 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
from catboost import CatBoostClassifier # importing the function
# Same 1:5 class weighting and depth cap as the random forest above;
# verbose=500 logs every 500th iteration, early_stopping_rounds=30 halts
# when the loss stops improving for 30 rounds.
cb = CatBoostClassifier(random_state=42,verbose=500,class_weights={0:1,1:5},max_depth=5,early_stopping_rounds=30,boosting_type='Ordered') # creating an instance
cb.fit(X_train,y_train) # fitting the model
Learning rate set to 0.011981
0: learn: 0.6756274 total: 87.5ms remaining: 1m 27s
500: learn: 0.1157566 total: 8.33s remaining: 8.29s
999: learn: 0.0604226 total: 16.5s remaining: 0us
# NOTE(review): as with the random forest, CV here runs on the test split only.
cb_cv_f1 = cross_validate(cb,X_test,y_test,cv=StratifiedKFold(n_splits=5,shuffle=True,random_state=42),scoring=scoring)
# cross-validating over 5 folds; evaluated metrics: accuracy, precision, recall and F-1 score
Learning rate set to 0.006019
0: learn: 0.6857393 total: 16.8ms remaining: 16.8s
500: learn: 0.1806446 total: 6.67s remaining: 6.64s
999: learn: 0.1018346 total: 13s remaining: 0us
Learning rate set to 0.006028
0: learn: 0.6866775 total: 15.1ms remaining: 15.1s
500: learn: 0.1822175 total: 7.21s remaining: 7.18s
999: learn: 0.1087319 total: 14.1s remaining: 0us
Learning rate set to 0.006028
0: learn: 0.6867839 total: 15.7ms remaining: 15.7s
500: learn: 0.1680272 total: 7.19s remaining: 7.16s
999: learn: 0.0988092 total: 14s remaining: 0us
Learning rate set to 0.006028
0: learn: 0.6867894 total: 27.6ms remaining: 27.6s
500: learn: 0.1909167 total: 7.13s remaining: 7.1s
999: learn: 0.1079882 total: 14.1s remaining: 0us
Learning rate set to 0.006028
0: learn: 0.6872671 total: 18.6ms remaining: 18.5s
500: learn: 0.1850676 total: 6.91s remaining: 6.88s
999: learn: 0.1092527 total: 14.1s remaining: 0us
# Mean cross-validation metrics for the CatBoost model.
print(f" ACCURACY: {cb_cv_f1['test_accuracy'].mean()}")
print(f" PRECISION: {cb_cv_f1['test_precision'].mean()}")
print(f" RECALL: {cb_cv_f1['test_recall'].mean()}")
print(f" F-1 Score: {cb_cv_f1['test_f1_score'].mean()}")
ACCURACY: 0.901643192488263
PRECISION: 0.636451969083548
RECALL: 0.8272727272727272
F-1 Score: 0.7126429512516469
cb_pred = cb.predict(X_test) # predicting on the hold out test set
print(classification_report(y_test,cb_pred)) # per-class precision/recall/F1
print(confusion_matrix(y_test,cb_pred)) # rows = actual class, columns = predicted class
print(precision_score(y_test,cb_pred)) # precision for the positive class (Type=1)
print(recall_score(y_test,cb_pred)) # recall for the positive class
print(f1_score(y_test,cb_pred)) # F1 for the positive class
precision recall f1-score support
0 0.98 0.96 0.97 304
1 0.79 0.87 0.83 52
accuracy 0.95 356
macro avg 0.88 0.91 0.90 356
weighted avg 0.95 0.95 0.95 356
[[292 12]
[ 7 45]]
0.7894736842105263
0.8653846153846154
0.8256880733944955
def correlation(dataset, threshold):
    """Return the columns that are highly collinear with an earlier column.

    A column is flagged when the absolute Pearson correlation between it and
    any column preceding it in ``dataset`` exceeds ``threshold``.  Only the
    later column of each correlated pair is returned, so dropping the result
    keeps one representative of every correlated group.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Numeric dataframe whose pairwise correlations are inspected.
    threshold : float
        Absolute-correlation cutoff, e.g. 0.7 for 70% collinearity.

    Returns
    -------
    set of str
        Names of columns exceeding the cutoff against an earlier column.
    """
    col_corr = set()  # a set avoids adding the same column twice
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):  # j < i: lower triangle only, diagonal excluded
            # BUG FIX: the original compared the *signed* correlation, so
            # strongly negatively correlated columns were silently missed,
            # despite the comment claiming abs() was taken. abs() is now
            # actually applied.
            if abs(corr_matrix.iloc[i, j]) > threshold:
                colname = corr_matrix.columns[i]  # name of the later column
                col_corr.add(colname)
    return col_corr
correlation(X_smote,0.7) # all those columns which have more than 70% collinearity
X_smote2 = X_smote.drop(list(correlation(X_smote,0.7)),axis=1) # removing all those columns which have more than 70% collinearity
from sklearn.feature_selection import mutual_info_classif
# Mutual information of each remaining feature with the target variable.
mutual_info = mutual_info_classif(X_smote2, y_smote)
mutual_info
mutual_info = pd.Series(mutual_info) # wrap in a Series so scores can carry feature labels
mutual_info.index = X_smote2.columns # label each score with its feature name
mutual_info.sort_values(ascending=False) # highest information gain first
# Bar plot of mutual information gain with respect to the target variable.
# BUG FIX: the labels/title were previously set BEFORE plot.bar(); since
# pandas' plot.bar(figsize=...) opens a new figure, they ended up on the
# previous (wrong) figure. Draw the plot first, then annotate its axes.
mutual_info.sort_values(ascending=False).plot.bar(figsize=(20, 10))
plt.ylabel("Mutual Information Gain")
plt.xlabel("Independent Features")
plt.title("Mutual Information Gain of each feature with respect to target variable")