import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
from matplotlib import image
# Read the illustration from disk into an array and render it with pyplot.
img = image.imread("/work/what-is-an-anomaly.jpg")
plt.imshow(img)
# Flush the image figure before any later plotting call draws.
plt.show()
# Load the housing CSV and keep the raw frame untouched by working on a copy.
housing_data= pd.read_csv('/work/Melbourne_housing_FULL.csv')
data= housing_data.copy()
# Bare expressions: they only display something in an interactive/notebook
# session and have no effect when the file runs as a plain script.
data
data.columns.values
# Drop every row that contains at least one missing value.
data=data.dropna(axis=0)
# Distribution plot of the 'YearBuilt' column.
sns.displot(data['YearBuilt'])
# Restrict the x-axis of the current axes to the 1800-2100 range.
# NOTE(review): presumably the current axes is the one displot just drew - confirm.
plt.xlim(1800,2100)
def iqr_outlier(df, col):
    """Report and count the outliers of one column using the 1.5 * IQR rule.

    A value is an outlier when it lies below ``Q25 - 1.5 * IQR`` or above
    ``Q75 + 1.5 * IQR``. The IQR, both bounds and the total are printed.

    Args:
        df: DataFrame holding the column to inspect.
        col: Label of a numeric column in ``df``.

    Returns:
        The number of outliers found, as an ``int``.
    """
    # Missing values would turn both quantiles (and therefore both bounds)
    # into NaN, which silently reports zero outliers - so ignore them.
    values = df[col].dropna()

    Q25 = np.quantile(values, 0.25)
    Q75 = np.quantile(values, 0.75)
    IQR = Q75 - Q25
    w_range = IQR * 1.5

    # Lower and upper whisker bounds.
    w_lower, w_upper = Q25 - w_range, Q75 + w_range
    print('The IQR is', IQR)
    print('The lower bound value is', w_lower)
    print('The upper bound value is', w_upper)

    # Count the values falling outside either bound.
    n_outliers = int(((values > w_upper) | (values < w_lower)).sum())
    print('Total number of outliers are', n_outliers)
    # Return the count itself; `return print(...)` always yielded None.
    return n_outliers
# Run the IQR outlier report on the 'YearBuilt' column.
iqr_outlier(data,'YearBuilt')
# Box plots of single columns to eyeball extreme values.
# NOTE(review): there is no plt.figure()/plt.show() between these calls, so
# when run as a plain script they presumably all draw onto the same current
# axes; each call was likely its own notebook cell - confirm.
sns.boxplot(y='Rooms',data=data)
sns.boxplot(y='Price',data=data)
sns.boxplot(y='Landsize', data=data)
# 'Distance' on the x-axis, one box per 'Type' value on the y-axis.
sns.boxplot(x='Distance', y='Type',data=data)
# Scatter of 'Landsize' (x) against 'BuildingArea' (y).
plt.scatter(data['Landsize'], data['BuildingArea'])
from sklearn.ensemble import IsolationForest
# One-dimensional Isolation Forest per feature: for every column, fit a
# forest on that column alone, score an evenly spaced grid spanning the
# column's range, and shade the parts of the grid predicted as outliers.
columns = ['Price','Landsize','BuildingArea','Distance']
fig, axs = plt.subplots(1, 4, figsize=(20, 5), facecolor='w', edgecolor='k')
axs = axs.ravel()

for index, name in enumerate(columns):
    ax = axs[index]
    feature = data[name]

    # The estimator expects a 2-D array, hence the single-column reshape.
    forest = IsolationForest(contamination='auto')
    forest.fit(feature.values.reshape(-1,1))

    # Evaluation grid: as many points as rows, from the column min to max.
    grid = np.linspace(feature.min(), feature.max(), len(data)).reshape(-1,1)
    scores = forest.decision_function(grid)
    labels = forest.predict(grid)

    ax.plot(grid, scores, label='anomaly score')
    # Shade, over the full score range, the grid points labelled -1.
    ax.fill_between(
        grid[:, 0],
        scores.min(),
        scores.max(),
        where=(labels == -1),
        color='r',
        alpha=.2,
        label='outlier region',
    )
    ax.legend()
    ax.set_title(name)
from sklearn.neighbors import LocalOutlierFactor
# Flag spatial outliers among the property coordinates with Local Outlier
# Factor and plot inliers (blue) against outliers (red).
LOF = LocalOutlierFactor(n_neighbors=50, contamination='auto')
# Column 0 is 'Lattitude', column 1 is 'Longtitude' (names as spelled here;
# presumably they match the CSV header).
x = data[['Lattitude','Longtitude']].values
# fit_predict labels every row 1 (inlier) or -1 (outlier).
y_pred = LOF.fit_predict(x)

# Boolean masks straight from the label array instead of a Python-level loop.
in_mask = y_pred == 1
out_mask = y_pred == -1

plt.figure(figsize=(12,12))
plt.title("Local Outlier Factor (LOF)")
# inliers
plt.scatter(x[in_mask, 0], x[in_mask, 1], c='blue', edgecolor='k', s=30)
# outliers
plt.scatter(x[out_mask, 0], x[out_mask, 1], c='red', edgecolor='k', s=30)
plt.axis('tight')
# Axis labels follow the column order of `x`: latitude on x, longitude on y.
plt.xlabel('Latitude')
plt.ylabel('Longitude')
plt.show()
# Dependency of the PyOD KNN demo that follows. `!pip install ...` is IPython
# shell-magic syntax and a SyntaxError in a plain .py file, so it is kept in
# the commented-magic form; run the command in a shell (or re-enable the magic
# inside a notebook) before executing the code below.
# !pip install pyod==0.9.7
from scipy import stats
import matplotlib.font_manager
from pyod.models.knn import KNN
from pyod.utils.data import generate_data, get_outliers_inliers
# PyOD K-Nearest-Neighbours demo on a synthetic two-feature dataset:
# generate labelled data, fit the detector, count mislabelled points and draw
# the learned decision boundary together with the true inliers/outliers.

# Share of outliers; used both to generate the data and as the detector's
# contamination so the two can never drift apart.
outlier_fraction = 0.1

# generating a random dataset with two features
X_train, y_train = generate_data(n_train=300, train_only=True, n_features=2,
                                 contamination=outlier_fraction)

# Storing the outliers and inliers in different numpy arrays
X_outliers, X_inliers = get_outliers_inliers(X_train, y_train)
n_inliers = len(X_inliers)
n_outliers = len(X_outliers)

# Separating the two features
f1 = X_train[:, [0]].reshape(-1, 1)
f2 = X_train[:, [1]].reshape(-1, 1)

# Evaluation grid over [-10, 10] x [-10, 10] for the decision surface
xx, yy = np.meshgrid(np.linspace(-10, 10, 200),
                     np.linspace(-10, 10, 200))

# Raw scatter plot of the generated points
plt.scatter(f1, f2)
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')

# Training the classifier (unsupervised: the labels are not needed to fit)
clf = KNN(contamination=outlier_fraction)
clf.fit(X_train)

# Scores are negated, so lower values mean "more anomalous" from here on.
scores_pred = clf.decision_function(X_train) * -1
y_pred = clf.predict(X_train)

# Counting the number of errors
n_errors = (y_pred != y_train).sum()
print('The number of prediciton errors are ' + str(n_errors))

# Negated score below which the configured fraction of training points falls
threshold = stats.scoreatpercentile(scores_pred, 100 * outlier_fraction)

# Negated raw anomaly score for every grid point, reshaped to the grid
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) * -1
Z = Z.reshape(xx.shape)

# Start a fresh figure so the subplot does not land on top of the raw scatter.
plt.figure()
subplot = plt.subplot(1, 2, 1)

# Red contour line where the negated score equals the threshold
a = subplot.contour(xx, yy, Z, levels=[threshold],
                    linewidths=2, colors='red')

# Orange fill where the negated score lies between threshold and its maximum
subplot.contourf(xx, yy, Z, levels=[threshold, Z.max()], colors='orange')

# Plot by ground-truth split rather than by row position, so the picture stays
# correct whatever order generate_data returns the rows in.
# true inliers as white dots
b = subplot.scatter(X_inliers[:, 0], X_inliers[:, 1],
                    c='white', s=20, edgecolor='k')
# true outliers as black dots
c = subplot.scatter(X_outliers[:, 0], X_outliers[:, 1],
                    c='black', s=20, edgecolor='k')
subplot.axis('tight')

# legend_elements() supplies a legend handle for the contour line without
# touching ContourSet.collections, which newer matplotlib releases dropped.
boundary_handles, _ = a.legend_elements()
subplot.legend(
    [boundary_handles[0], b, c],
    ['learned decision function', 'true inliers', 'true outliers'],
    prop=matplotlib.font_manager.FontProperties(size=10))
subplot.set_title('K-Nearest Neighbours')
subplot.set_xlim((-10, 10))
subplot.set_ylim((-10, 10))
plt.show()