Anomaly Detection

import pandas as pd import numpy as np import seaborn as sns import matplotlib.pyplot as plt

from matplotlib import image
import matplotlib.pyplot as plt

# Read the illustration from disk and render it as a figure.
img = image.imread("/work/what-is-an-anomaly.jpg")
plt.imshow(img)
plt.show()

# Keep the raw table as read from disk; all later work is done on a copy.
housing_data = pd.read_csv('/work/Melbourne_housing_FULL.csv')
data = housing_data.copy()
data

# Column names as a NumPy array.
data.columns.values

# Discard every row that contains at least one missing value.
data = data.dropna(axis='index')

# Distribution of construction years, with the x-axis limited to 1800-2100.
year_built = data['YearBuilt']
sns.displot(year_built)
plt.xlim(1800, 2100)

def iqr_outlier(df, col):
    """Print an IQR-based outlier summary for one column and return the count.

    A value is treated as an outlier when it lies strictly below
    ``Q25 - 1.5 * IQR`` or strictly above ``Q75 + 1.5 * IQR``, where Q25 and
    Q75 are the 0.25 and 0.75 quantiles of ``df[col]``.

    Args:
        df: Table indexed by column name (e.g. a pandas DataFrame).
        col: Name of the numeric column to inspect.

    Returns:
        int: Number of values outside the two bounds. Earlier versions
        returned ``None`` (the result of ``print``); callers that ignored the
        return value are unaffected.
    """
    values = df[col]
    Q25, Q75 = np.quantile(values, [0.25, 0.75])
    IQR = Q75 - Q25
    w_range = IQR * 1.5

    # Lower and upper bounds of the accepted range.
    w_lower, w_upper = Q25 - w_range, Q75 + w_range

    print('The IQR is', IQR)
    print('The lower bound value is', w_lower)
    print('The upper bound value is', w_upper)

    # One boolean mask covers both tails; no filtered copies of df are built.
    outside = (values > w_upper) | (values < w_lower)
    n_outliers = int(outside.sum())

    print('Total number of outliers are', n_outliers)
    return n_outliers

# IQR summary for the construction-year column.
iqr_outlier(data, 'YearBuilt')

# Box plots of individual columns.
sns.boxplot(data=data, y='Rooms')

sns.boxplot(data=data, y='Price')

sns.boxplot(data=data, y='Landsize')

# Distance on the x-axis, one box per value of Type.
sns.boxplot(data=data, x='Distance', y='Type')

# Land size against building area.
plt.scatter(x=data['Landsize'], y=data['BuildingArea'])

from sklearn.ensemble import IsolationForest

columns = ['Price', 'Landsize', 'BuildingArea', 'Distance']

# One panel per feature, laid out in a single row.
fig, axs = plt.subplots(1, 4, figsize=(20, 5), facecolor='w', edgecolor='k')
axs = axs.ravel()

for index, name in enumerate(columns):
    panel = axs[index]
    feature = data[name]

    # A fresh forest is fitted on this single feature, shaped as (n, 1).
    isolation_forest = IsolationForest(contamination='auto')
    isolation_forest.fit(feature.values.reshape(-1, 1))

    # Evenly spaced probe values spanning the observed range of the feature.
    xx = np.linspace(feature.min(), feature.max(), len(data)).reshape(-1, 1)
    anomaly_score = isolation_forest.decision_function(xx)
    outlier = isolation_forest.predict(xx)

    panel.plot(xx, anomaly_score, label='anomaly score')

    # Shade the probe values that the forest predicts as outliers (-1).
    panel.fill_between(
        xx[:, 0],
        anomaly_score.min(),
        anomaly_score.max(),
        where=(outlier == -1),
        color='r',
        alpha=.2,
        label='outlier region',
    )
    panel.legend()
    panel.set_title(name)

from sklearn.neighbors import LocalOutlierFactor

# Local Outlier Factor on the two coordinate columns, 50 neighbours per point.
LOF = LocalOutlierFactor(contamination='auto', n_neighbors=50)

coordinates = data[['Lattitude', 'Longtitude']]
x = coordinates.values

# Fit and label every row in one step.
y_pred = LOF.fit_predict(x)

plt.figure(figsize=(12, 12))

# Per-row masks derived from the LOF labels: 1 is an inlier, -1 an outlier.
in_mask = (y_pred == 1).tolist()
out_mask = (y_pred == -1).tolist()

plt.title("Local Outlier Factor (LOF)")

# Inliers are drawn in blue, outliers in red; column 0 of x goes on the
# horizontal axis and column 1 on the vertical axis.
a = plt.scatter(x[in_mask, 0], x[in_mask, 1], c='blue', edgecolor='k', s=30)
b = plt.scatter(x[out_mask, 0], x[out_mask, 1], c='red', edgecolor='k', s=30)

plt.axis('tight')
plt.xlabel('Latitude')
plt.ylabel('Longitude')
plt.show()

# Notebook shell escape: runs pip to install pyod pinned to version 0.9.7.
# The leading "!" is IPython/Jupyter syntax and is not valid in a plain
# Python script.
!pip install pyod==0.9.7

from scipy import stats import matplotlib.font_manager from pyod.models.knn import KNN from pyod.utils.data import generate_data, get_outliers_inliers

# Synthetic training set: 300 samples with two features, plus their labels.
X_train, y_train = generate_data(n_train=300, n_features=2, train_only=True)

# Share of the samples treated as outliers.
outlier_fraction = 0.1

# Split the samples by label and record how many fall in each group.
X_outliers, X_inliers = get_outliers_inliers(X_train, y_train)
n_inliers = len(X_inliers)
n_outliers = len(X_outliers)

# Each feature as its own (n, 1) column.
f1 = X_train[:, [0]]
f2 = X_train[:, [1]]

# 200 x 200 evaluation grid covering [-10, 10] on both axes.
grid_axis = np.linspace(-10, 10, 200)
xx, yy = np.meshgrid(grid_axis, grid_axis)

# Raw view of the dataset: first feature against second feature.
plt.scatter(f1, f2)
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')

# Fit the k-nearest-neighbours detector with the chosen outlier share.
clf = KNN(contamination=outlier_fraction)
clf.fit(X_train, y_train)

# Raw decision scores on the training data, with the sign flipped.
scores_pred = -1 * clf.decision_function(X_train)

# Compare the predicted labels with the generated ones.
y_pred = clf.predict(X_train)
mismatches = y_pred != y_train
n_errors = mismatches.sum()

print(f'The number of prediciton errors are {n_errors}')

# Score cut-off: the percentile of the sign-flipped training scores that
# matches the chosen outlier fraction.
threshold = stats.scoreatpercentile(scores_pred, 100 * outlier_fraction)

# Sign-flipped raw decision score at every node of the (xx, yy) grid.
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) * -1
Z = Z.reshape(xx.shape)

subplot = plt.subplot(1, 2, 1)

# Red contour line where the grid score equals the threshold.
a = subplot.contour(xx, yy, Z, levels=[threshold], linewidths=2, colors='red')

# Orange fill where the grid score lies between the threshold and its maximum.
subplot.contourf(xx, yy, Z, levels=[threshold, Z.max()], colors='orange')

# Pick the points by their true label (arrays produced by
# get_outliers_inliers) instead of slicing by position, which is only
# correct when the outliers happen to be the last rows of X_train.
b = subplot.scatter(X_inliers[:, 0], X_inliers[:, 1],
                    c='white', s=20, edgecolor='k')
c = subplot.scatter(X_outliers[:, 0], X_outliers[:, 1],
                    c='black', s=20, edgecolor='k')

subplot.axis('tight')

# ContourSet.collections was deprecated in Matplotlib 3.8 and later removed;
# legend_elements() returns (artists, labels) and provides the legend handle
# for the contour line on old and new releases alike.
contour_handle = a.legend_elements()[0][0]

subplot.legend(
    [contour_handle, b, c],
    ['learned decision function', 'true inliers', 'true outliers'],
    prop=matplotlib.font_manager.FontProperties(size=10))

subplot.set_title('K-Nearest Neighbours')
subplot.set_xlim((-10, 10))
subplot.set_ylim((-10, 10))
plt.show()