import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv("train.csv") # importing the dataset
df.head() # checking the head
df.info() # checking the info
df.isna().sum() # Checking the missing values per column
100 * df['Type'].value_counts()/len(df) # class distribution of the target variable, in percent
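# A quick visual of the class balance (a minimal sketch; assumes 'Type' is the binary target as above)
plt.figure(figsize=(6,4))
sns.countplot(data=df, x='Type');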
df.describe() # checking the descriptive stats for each numerical feature
plt.figure(figsize=(12,6))
sns.boxplot(data=df,x='Type',y='NUMBER_SPECIAL_CHARACTERS');
plt.figure(figsize=(12,6))
sns.boxplot(data=df,x='Type',y='URL_LENGTH');
for i in df.select_dtypes(include='object').columns: # number of unique values in each categorical column
    print(f"{i} -> {df[i].nunique()}")
df['CHARSET'].value_counts()
def CHARSET_CLEANER(x):
    # keep the most frequent charsets and bucket everything else as "OTHERS"
    if x not in ['UTF-8','ISO-8859-1','utf-8','us-ascii','iso-8859-1']:
        return "OTHERS"
    else:
        return x
df['CHARSET'] = df['CHARSET'].apply(CHARSET_CLEANER)
df['CHARSET'].value_counts()
df['SERVER'].value_counts()
def SERVER_CLEANER(x):
    if x not in ['Apache','nginx','None','Microsoft-HTTPAPI/2.0','cloudflare-nginx']:
        return "OTHERS"
    else:
        return x
df['SERVER'] = df['SERVER'].apply(SERVER_CLEANER)
df['SERVER'].value_counts()
df['WHOIS_STATEPRO'].value_counts()[:7]
def STATE_CLEANER(x):
    if x not in ['CA','None','NY','WA','Barcelona','FL']:
        return "OTHERS"
    else:
        return x
df['WHOIS_STATEPRO'] = df['WHOIS_STATEPRO'].apply(STATE_CLEANER)
df['WHOIS_STATEPRO'].value_counts()
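# The three *_CLEANER functions above follow the same pattern: keep a handful of frequent
# categories and bucket the rest as "OTHERS". A more generic helper could do the same job;
# this is only a sketch and is not used below.
def keep_top_categories(series, n=5, other_label="OTHERS"):
    top = series.value_counts().nlargest(n).index
    return series.where(series.isin(top), other_label)
# Example usage: df['SERVER'] = keep_top_categories(df['SERVER'], n=5)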
def DATE_CLEANER(x):
    # collapse the raw WHOIS dates into a simple Present/Absent flag
    if x == 'None':
        return "Absent"
    else:
        return "Present"
df['WHOIS_REGDATE'] = df['WHOIS_REGDATE'].apply(DATE_CLEANER)
df['WHOIS_UPDATED_DATE'] = df['WHOIS_UPDATED_DATE'].apply(DATE_CLEANER)
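# Quick sanity check on the new Present/Absent flags (optional)
df[['WHOIS_REGDATE','WHOIS_UPDATED_DATE']].apply(pd.Series.value_counts)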
df.head()
df.drop(['URL','WHOIS_COUNTRY'],axis=1,inplace=True)
df.head()
plt.figure(figsize=(20,10))
sns.heatmap(data=df.corr(numeric_only=True),cmap='plasma',annot=True) # correlation heatmap of the numeric features (numeric_only avoids errors on mixed-type frames in newer pandas)
df2 = df.copy() # creating a copy of our dataframe
df2.drop("CONTENT_LENGTH",axis=1,inplace=True) # dropping the column which is not required
df3 = df2.copy() # creating a copy of the dataframe
df3 = pd.get_dummies(df3,columns=['WHOIS_UPDATED_DATE','WHOIS_REGDATE','WHOIS_STATEPRO','SERVER','CHARSET'],drop_first=True) # creating dummies
df3.head() # checking the head
df3.isna().sum() # checking for any missing value
df3.dropna(inplace=True) # dropping all rows with missing values
!pip install imbalanced-learn # the package that provides the imblearn module
# Importing the SMOTE function
from imblearn.over_sampling import SMOTE
# Creating the set of independent features and target variable
X = df3.drop("Type",axis=1)
y = df3['Type']
from imblearn.under_sampling import RandomUnderSampler # importing the undersampling function
# sampling_strategy here is the desired minority/majority ratio after undersampling:
# 0.5 means the majority class is reduced until the minority class is half its size.
undersample = RandomUnderSampler(sampling_strategy=0.5)
from imblearn.pipeline import Pipeline # importing the pipeline
# For SMOTE, sampling_strategy is the desired minority/majority ratio after oversampling:
# 0.5 means synthetic minority samples are generated until the minority class is half the size of the majority class.
oversample = SMOTE(sampling_strategy=0.5)
# Steps for the pipeline: first oversample the minority class, then undersample the majority class.
# (With both ratios set to 0.5, the undersampling step removes little or nothing, since the 0.5 ratio is already reached after SMOTE.)
steps = [('o',oversample),('u',undersample)]
pipeline = Pipeline(steps=steps) # creating the pipeline instance
X_smote, y_smote = pipeline.fit_resample(X,y) # fitting the pipeline to our dataset
y_smote.value_counts() # taking value counts of the target feature
len(X_smote) # checking the total number of samples we have
X_smote.shape # checking the shape
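# Comparing the class balance before and after the resampling pipeline (sanity check)
print(y.value_counts(normalize=True))
print(y_smote.value_counts(normalize=True))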
from sklearn.model_selection import train_test_split,StratifiedKFold,cross_val_score,cross_validate # importing the required functions
X_train, X_test, y_train, y_test = train_test_split(X_smote, y_smote, test_size=0.2, random_state=42) # creating a test holdout set from the resampled data
# (note: applying SMOTE before the split means synthetic samples can appear in the test set; resampling only the training data would avoid this)
from sklearn.preprocessing import StandardScaler # import the standard scaling function
sc = StandardScaler() # creating an instance of the scaling function
X_train = sc.fit_transform(X_train) # fitting and transforming the training set
X_test = sc.transform(X_test) # just transforming the testing set to avoid 'data leakage'
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,f1_score,fbeta_score,make_scorer,precision_score,recall_score
# importing all the metric scores required for evaluation
# creating a dictionary of scorers to evaluate over stratified k-fold CV
scoring = {'accuracy' : make_scorer(accuracy_score),
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score),
           'f1_score' : make_scorer(f1_score)}
!pip install scikit-learn
from sklearn.ensemble import RandomForestClassifier # importing the function
rf = RandomForestClassifier(n_estimators=100,criterion='entropy',random_state=42,class_weight={0:1,1:5},max_depth=5) # creating an instance
rf.fit(X_train,y_train) # fitting the model
rf_cv_f1 = cross_validate(rf,X_train,y_train,cv=StratifiedKFold(n_splits=5,shuffle=True,random_state=111),scoring=scoring)
# cross-validating our model over 5 stratified folds of the training set; the evaluation metrics are accuracy, precision, recall and F1 score
print(f" ACCURACY: {rf_cv_f1['test_accuracy'].mean()}")
print(f" PRECISION: {rf_cv_f1['test_precision'].mean()}")
print(f" RECALL: {rf_cv_f1['test_recall'].mean()}")
print(f" F-1 Score: {rf_cv_f1['test_f1_score'].mean()}")
rf_pred = rf.predict(X_test) # predicting on the hold out test set
print(classification_report(y_test,rf_pred))
print(confusion_matrix(y_test,rf_pred))
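# The raw confusion matrix can be hard to read; a heatmap sketch of the same numbers
# (rows = actual class, columns = predicted class)
plt.figure(figsize=(6,4))
sns.heatmap(confusion_matrix(y_test,rf_pred), annot=True, fmt='d', cmap='plasma')
plt.xlabel("Predicted")
plt.ylabel("Actual");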
!pip install catboost
from catboost import CatBoostClassifier # importing the function
cb = CatBoostClassifier(random_state=42,verbose=500,class_weights={0:1,1:5},max_depth=5,early_stopping_rounds=30,boosting_type='Ordered') # creating an instance
cb.fit(X_train,y_train) # fitting the model
cb_cv_f1 = cross_validate(cb,X_train,y_train,cv=StratifiedKFold(n_splits=5,shuffle=True,random_state=42),scoring=scoring)
# cross-validating our model over 5 stratified folds of the training set; the evaluation metrics are accuracy, precision, recall and F1 score
print(f" ACCURACY: {cb_cv_f1['test_accuracy'].mean()}")
print(f" PRECISION: {cb_cv_f1['test_precision'].mean()}")
print(f" RECALL: {cb_cv_f1['test_recall'].mean()}")
print(f" F-1 Score: {cb_cv_f1['test_f1_score'].mean()}")
cb_pred = cb.predict(X_test) # predicting on the hold out test set
print(classification_report(y_test,cb_pred))
print(confusion_matrix(y_test,cb_pred))
print(precision_score(y_test,cb_pred)) # precision of CatBoost on the holdout set
print(recall_score(y_test,cb_pred)) # recall on the holdout set
print(f1_score(y_test,cb_pred)) # F1 score on the holdout set
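# CatBoost exposes per-feature importances; a sketch that maps them back to the column names
# of the resampled feature matrix (assumes the column order is unchanged by scaling and splitting)
cb_importance = pd.Series(cb.get_feature_importance(), index=X_smote.columns)
cb_importance.sort_values(ascending=False).head(10)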
def correlation(dataset,threshold):
    col_corr = set() # empty set to avoid repetition later
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i,j]) > threshold: # abs is taken to consider highly negatively correlated columns as well
                colname = corr_matrix.columns[i] # getting the name of the column
                col_corr.add(colname)
    return col_corr
correlation(X_smote,0.7) # all columns that have an absolute correlation above 0.7 with another column
X_smote2 = X_smote.drop(list(correlation(X_smote,0.7)),axis=1) # removing those highly correlated columns
from sklearn.feature_selection import mutual_info_classif
# determine the mutual information
mutual_info = mutual_info_classif(X_smote2, y_smote)
mutual_info
mutual_info = pd.Series(mutual_info) # Creating series of column names and their respective mutual information gain
mutual_info.index = X_smote2.columns # setting up index
mutual_info.sort_values(ascending=False) # sorting the values
# Bar plot of mutual information gain with respect to the target variable
mutual_info.sort_values(ascending=False).plot.bar(figsize=(20, 10))
plt.xlabel("Independent Features")
plt.ylabel("Mutual Information Gain")
plt.title("Mutual Information Gain of each feature with respect to target variable");
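# One possible follow-up: keep only the top-k features ranked by mutual information, using SelectKBest.
# This is a sketch; k=10 is an arbitrary choice, not tuned here.
from sklearn.feature_selection import SelectKBest
selector = SelectKBest(mutual_info_classif, k=10)
selector.fit(X_smote2, y_smote)
X_smote2.columns[selector.get_support()] # names of the selected features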