import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv("train.csv") # importing the dataset
df.head() # checking the head
df.info() # checking the info
df.isna().sum() # Checking the missing values per column
100 * df['Type'].value_counts()/len(df) # class distribution of the target variable, in percent
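# A quick visual of the class balance (a minimal sketch; assumes 'Type' is the binary target as above)
plt.figure(figsize=(6,4))
sns.countplot(data=df, x='Type');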
df.describe() # checking the descriptive stats for each numerical feature
plt.figure(figsize=(12,6))
sns.boxplot(data=df,x='Type',y='NUMBER_SPECIAL_CHARACTERS');
plt.figure(figsize=(12,6))
sns.boxplot(data=df,x='Type',y='URL_LENGTH');
for i in df.select_dtypes(include='object').columns: # number of unique values in each categorical column
    print(f"{i} -> {df[i].nunique()}")
df['CHARSET'].value_counts()
def CHARSET_CLEANER(x):
    # keep the most frequent charsets and bucket everything else as "OTHERS"
    if x not in ['UTF-8','ISO-8859-1','utf-8','us-ascii','iso-8859-1']:
        return "OTHERS"
    else:
        return x
df['CHARSET'] = df['CHARSET'].apply(CHARSET_CLEANER)
df['CHARSET'].value_counts()
df['SERVER'].value_counts()
def SERVER_CLEANER(x):
    if x not in ['Apache','nginx','None','Microsoft-HTTPAPI/2.0','cloudflare-nginx']:
        return "OTHERS"
    else:
        return x
df['SERVER'] = df['SERVER'].apply(SERVER_CLEANER)
df['SERVER'].value_counts()
df['WHOIS_STATEPRO'].value_counts()[:7]
def STATE_CLEANER(x):
    if x not in ['CA','None','NY','WA','Barcelona','FL']:
        return "OTHERS"
    else:
        return x
df['WHOIS_STATEPRO'] = df['WHOIS_STATEPRO'].apply(STATE_CLEANER)
df['WHOIS_STATEPRO'].value_counts()
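# The three *_CLEANER functions above follow the same pattern: keep a handful of frequent
# categories and bucket the rest as "OTHERS". A more generic helper could do the same job;
# this is only a sketch and is not used below.
def keep_top_categories(series, n=5, other_label="OTHERS"):
    top = series.value_counts().nlargest(n).index
    return series.where(series.isin(top), other_label)
# Example usage: df['SERVER'] = keep_top_categories(df['SERVER'], n=5)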
def DATE_CLEANER(x):
    # collapse the raw WHOIS dates into a simple Present/Absent flag
    if x == 'None':
        return "Absent"
    else:
        return "Present"
df['WHOIS_REGDATE'] = df['WHOIS_REGDATE'].apply(DATE_CLEANER)
df['WHOIS_UPDATED_DATE'] = df['WHOIS_UPDATED_DATE'].apply(DATE_CLEANER)
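# Quick sanity check on the new Present/Absent flags (optional)
df[['WHOIS_REGDATE','WHOIS_UPDATED_DATE']].apply(pd.Series.value_counts)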
df.head()
df.drop(['URL','WHOIS_COUNTRY'],axis=1,inplace=True)
df.head()
plt.figure(figsize=(20,10))
sns.heatmap(data=df.corr(numeric_only=True),cmap='plasma',annot=True) # correlation heatmap of the numeric features (numeric_only avoids errors on mixed-type frames in newer pandas)
df2 = df.copy() # creating a copy of our dataframe
df2.drop("CONTENT_LENGTH",axis=1,inplace=True) # dropping the column which is not required
df3 = df2.copy() # creating a copy of the dataframe
df3 = pd.get_dummies(df3,columns=['WHOIS_UPDATED_DATE','WHOIS_REGDATE','WHOIS_STATEPRO','SERVER','CHARSET'],drop_first=True) # creating dummies
df3.head() # checking the head
df3.isna().sum() # checking for any missing value
df3.dropna(inplace=True) # dropping all rows with missing values
!pip install imbalanced-learn # the package that provides the imblearn module
# Importing the SMOTE function
from imblearn.over_sampling import SMOTE
# Creating the set of independent features and target variable
X = df3.drop("Type",axis=1)
y = df3['Type']
from imblearn.under_sampling import RandomUnderSampler # importing the undersampling function
# sampling_strategy here is the desired minority/majority ratio after undersampling:
# 0.5 means the majority class is reduced until the minority class is half its size.
undersample = RandomUnderSampler(sampling_strategy=0.5)
from imblearn.pipeline import Pipeline # importing the pipeline
# For SMOTE, sampling_strategy is the desired minority/majority ratio after oversampling:
# 0.5 means synthetic minority samples are generated until the minority class is half the size of the majority class.
oversample = SMOTE(sampling_strategy=0.5)
# Steps for the pipeline: first oversample the minority class, then undersample the majority class.
# (With both ratios set to 0.5, the undersampling step removes little or nothing, since the 0.5 ratio is already reached after SMOTE.)
steps = [('o',oversample),('u',undersample)]
pipeline = Pipeline(steps=steps) # creating the pipeline instance
X_smote, y_smote = pipeline.fit_resample(X,y) # fitting the pipeline to our dataset
y_smote.value_counts() # taking value counts of the target feature
len(X_smote) # checking the total number of samples we have
X_smote.shape # checking the shape
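# Comparing the class balance before and after the resampling pipeline (sanity check)
print(y.value_counts(normalize=True))
print(y_smote.value_counts(normalize=True))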
from sklearn.model_selection import train_test_split,StratifiedKFold,cross_val_score,cross_validate # importing the required functions
X_train, X_test, y_train, y_test = train_test_split(X_smote, y_smote, test_size=0.2, random_state=42) # creating a test holdout set from the resampled data
# (note: applying SMOTE before the split means synthetic samples can appear in the test set; resampling only the training data would avoid this)
from sklearn.preprocessing import StandardScaler # import the standard scaling function
sc = StandardScaler() # creating an instance of the scaling function
X_train = sc.fit_transform(X_train) # fitting and transforming the training set
X_test = sc.transform(X_test) # just transforming the testing set to avoid 'data leakage'
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,f1_score,fbeta_score,make_scorer,precision_score,recall_score
# importing all the metric scores required for evaluation
# creating a dictionary of scorers to evaluate over stratified k-fold CV
scoring = {'accuracy' : make_scorer(accuracy_score),
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score),
           'f1_score' : make_scorer(f1_score)}
!pip install scikit-learn
from sklearn.ensemble import RandomForestClassifier # importing the function
rf = RandomForestClassifier(n_estimators=100,criterion='entropy',random_state=42,class_weight={0:1,1:5},max_depth=5) # creating an instance
rf.fit(X_train,y_train) # fitting the model
rf_cv_f1 = cross_validate(rf,X_train,y_train,cv=StratifiedKFold(n_splits=5,shuffle=True,random_state=111),scoring=scoring)
# cross-validating our model over 5 stratified folds of the training set; the evaluation metrics are accuracy, precision, recall and F1 score
print(f" ACCURACY: {rf_cv_f1['test_accuracy'].mean()}")
print(f" PRECISION: {rf_cv_f1['test_precision'].mean()}")
print(f" RECALL: {rf_cv_f1['test_recall'].mean()}")
print(f" F-1 Score: {rf_cv_f1['test_f1_score'].mean()}")
rf_pred = rf.predict(X_test) # predicting on the hold out test set
print(classification_report(y_test,rf_pred))
print(confusion_matrix(y_test,rf_pred))
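# The raw confusion matrix can be hard to read; a heatmap sketch of the same numbers
# (rows = actual class, columns = predicted class)
plt.figure(figsize=(6,4))
sns.heatmap(confusion_matrix(y_test,rf_pred), annot=True, fmt='d', cmap='plasma')
plt.xlabel("Predicted")
plt.ylabel("Actual");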
!pip install catboost
from catboost import CatBoostClassifier # importing the function
cb = CatBoostClassifier(random_state=42,verbose=500,class_weights={0:1,1:5},max_depth=5,early_stopping_rounds=30,boosting_type='Ordered') # creating an instance
cb.fit(X_train,y_train) # fitting the model
cb_cv_f1 = cross_validate(cb,X_train,y_train,cv=StratifiedKFold(n_splits=5,shuffle=True,random_state=42),scoring=scoring)
# cross-validating our model over 5 stratified folds of the training set; the evaluation metrics are accuracy, precision, recall and F1 score
print(f" ACCURACY: {cb_cv_f1['test_accuracy'].mean()}")
print(f" PRECISION: {cb_cv_f1['test_precision'].mean()}")
print(f" RECALL: {cb_cv_f1['test_recall'].mean()}")
print(f" F-1 Score: {cb_cv_f1['test_f1_score'].mean()}")
cb_pred = cb.predict(X_test) # predicting on the hold out test set
print(classification_report(y_test,cb_pred))
print(confusion_matrix(y_test,cb_pred))
print(precision_score(y_test,cb_pred)) # precision of CatBoost on the holdout set
print(recall_score(y_test,cb_pred)) # recall on the holdout set
print(f1_score(y_test,cb_pred)) # F1 score on the holdout set
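# CatBoost exposes per-feature importances; a sketch that maps them back to the column names
# of the resampled feature matrix (assumes the column order is unchanged by scaling and splitting)
cb_importance = pd.Series(cb.get_feature_importance(), index=X_smote.columns)
cb_importance.sort_values(ascending=False).head(10)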
def correlation(dataset,threshold):
    col_corr = set() # empty set to avoid repetition later
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i,j]) > threshold: # abs is taken to consider highly negatively correlated columns as well
                colname = corr_matrix.columns[i] # getting the name of the column
                col_corr.add(colname)
    return col_corr
correlation(X_smote,0.7) # all columns that have an absolute correlation above 0.7 with another column
X_smote2 = X_smote.drop(list(correlation(X_smote,0.7)),axis=1) # removing those highly correlated columns
from sklearn.feature_selection import mutual_info_classif
# determine the mutual information
mutual_info = mutual_info_classif(X_smote2, y_smote)
mutual_info
mutual_info = pd.Series(mutual_info) # Creating series of column names and their respective mutual information gain
mutual_info.index = X_smote2.columns # setting up index
mutual_info.sort_values(ascending=False) # sorting the values
# Bar plot of mutual information gain with respect to the target variable
mutual_info.sort_values(ascending=False).plot.bar(figsize=(20, 10))
plt.xlabel("Independent Features")
plt.ylabel("Mutual Information Gain")
plt.title("Mutual Information Gain of each feature with respect to target variable");
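# One possible follow-up: keep only the top-k features ranked by mutual information, using SelectKBest.
# This is a sketch; k=10 is an arbitrary choice, not tuned here.
from sklearn.feature_selection import SelectKBest
selector = SelectKBest(mutual_info_classif, k=10)
selector.fit(X_smote2, y_smote)
X_smote2.columns[selector.get_support()] # names of the selected features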