import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
from matplotlib import image
# Read the illustration image from disk and render it.
img = image.imread("/work/what-is-an-anomaly.jpg")
plt.imshow(img)
plt.show()
# Load the housing CSV; `housing_data` keeps the raw load and `data` is the
# copy that the rest of the analysis works on.
housing_data= pd.read_csv('/work/Melbourne_housing_FULL.csv')
data= housing_data.copy()
# Bare expression: has no effect in a plain script; presumably left over from
# a notebook cell, where it displays the frame.
data
Suburbobject
Reservoir2.4%
Bentleigh East1.7%
349 others95.9%
Addressobject
5 Charles St0%
25 William St0%
34007 others100%
0
Abbotsford
68 Studley St
1
Abbotsford
85 Turner St
2
Abbotsford
25 Bloomburg St
3
Abbotsford
18/659 Victoria St
4
Abbotsford
5 Charles St
5
Abbotsford
40 Federation La
6
Abbotsford
55a Park St
7
Abbotsford
16 Maugie St
8
Abbotsford
53 Turner St
9
Abbotsford
99 Turner St
# Inspect the available column names.
data.columns.values
# Keep only the rows that have no missing value in any column.
data = data.dropna()
# Distribution of the 'YearBuilt' column, with the x-axis limited to 1800-2100.
year_built = data['YearBuilt']
sns.displot(year_built)
plt.xlim(left=1800, right=2100)
def iqr_outlier(df, col):
    """Report and count the IQR-rule outliers in one column of ``df``.

    A value counts as an outlier when it lies more than 1.5 * IQR below the
    first quartile or more than 1.5 * IQR above the third quartile of
    ``df[col]``.

    Args:
        df: Table indexable as ``df[col]`` (a pandas ``DataFrame`` at the
            call site in this file).
        col: Name of the numeric column to analyse.

    Returns:
        The number of rows whose ``col`` value falls strictly outside the
        lower/upper bounds. The IQR, both bounds and this total are also
        printed, exactly as before.
    """
    values = df[col]
    # Both quartiles in a single pass over the column.
    q25, q75 = np.quantile(values, [0.25, 0.75])
    IQR = q75 - q25
    w_range = IQR * 1.5
    # calculating the lower and upper bound value
    w_lower, w_upper = q25 - w_range, q75 + w_range
    print('The IQR is', IQR)
    print('The lower bound value is', w_lower)
    print('The upper bound value is', w_upper)
    # Calculating the number of outliers: a value cannot be on both sides of
    # the bounds, so one combined mask gives the same total as two filters.
    n_outliers = int(((values > w_upper) | (values < w_lower)).sum())
    print('Total number of outliers are', n_outliers)
    # Return the count (the original returned the None produced by print).
    return n_outliers
# Apply the IQR rule to the 'YearBuilt' column; the function prints the IQR,
# the lower/upper bounds and the number of outliers.
iqr_outlier(data,'YearBuilt')
The IQR is 55.0
The lower bound value is 1862.5
The upper bound value is 2082.5
Total number of outliers are 10
# One vertical box plot per column, drawn in the same order as before.
for column in ('Rooms', 'Price', 'Landsize'):
    sns.boxplot(data=data, y=column)
# 'Distance' box plots grouped by the 'Type' column.
sns.boxplot(data=data, x='Distance', y='Type')
# 'Landsize' plotted against 'BuildingArea'.
plt.scatter(data['Landsize'], data['BuildingArea'])
from sklearn.ensemble import IsolationForest
# Fit one single-feature Isolation Forest per column and plot its decision
# function over the column's observed range, shading the stretches of the
# range where the model predicts -1.
columns = ['Price', 'Landsize', 'BuildingArea', 'Distance']
fig, axs = plt.subplots(nrows=1, ncols=4, figsize=(20, 5), facecolor='w', edgecolor='k')
axs = axs.ravel()
for index, name in enumerate(columns):
    ax = axs[index]
    series = data[name]

    # Train on the observed values, shaped as an (n, 1) feature matrix.
    isolation_forest = IsolationForest(contamination='auto')
    isolation_forest.fit(series.values.reshape(-1, 1))

    # Evaluate on an evenly spaced grid from the column minimum to its
    # maximum, with as many points as there are rows in `data`.
    xx = np.linspace(series.min(), series.max(), len(data)).reshape(-1, 1)
    anomaly_score = isolation_forest.decision_function(xx)
    outlier = isolation_forest.predict(xx)

    ax.plot(xx, anomaly_score, label='anomaly score')
    # Shade the full score height wherever the grid point is predicted -1.
    ax.fill_between(
        xx.T[0],
        np.min(anomaly_score),
        np.max(anomaly_score),
        where=(outlier == -1),
        color='r',
        alpha=.2,
        label='outlier region',
    )
    ax.legend()
    ax.set_title(name)
from sklearn.neighbors import LocalOutlierFactor
# Run Local Outlier Factor on the two coordinate columns and plot the rows it
# labels 1 in blue and the rows it labels -1 in red.
LOF = LocalOutlierFactor(n_neighbors=50, contamination='auto')
x = data[['Lattitude','Longtitude']].values
y_pred = LOF.fit_predict(x)

# Vectorised boolean masks over the predicted labels (1 vs -1).
in_mask = y_pred == 1
out_mask = y_pred == -1

plt.figure(figsize=(12,12))
plt.title("Local Outlier Factor (LOF)")
# Column 0 of x is 'Lattitude' (x-axis), column 1 is 'Longtitude' (y-axis).
# inliers
plt.scatter(x[in_mask, 0], x[in_mask, 1], c='blue', edgecolor='k', s=30)
# outliers
plt.scatter(x[out_mask, 0], x[out_mask, 1], c='red', edgecolor='k', s=30)
plt.axis('tight')
plt.xlabel('Latitude')
plt.ylabel('Longitude')
plt.show()
!pip install pyod==0.9.7
Collecting pyod==0.9.7
Downloading pyod-0.9.7.tar.gz (114 kB)
|████████████████████████████████| 114 kB 26.5 MB/s
Requirement already satisfied: joblib in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pyod==0.9.7) (1.1.0)
Requirement already satisfied: matplotlib in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pyod==0.9.7) (3.5.1)
Requirement already satisfied: numpy>=1.13 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pyod==0.9.7) (1.19.5)
Collecting numba>=0.35
Downloading numba-0.55.1-1-cp37-cp37m-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (3.3 MB)
|████████████████████████████████| 3.3 MB 49.7 MB/s
Requirement already satisfied: scipy>=1.3.1 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pyod==0.9.7) (1.7.3)
Requirement already satisfied: scikit_learn>=0.20.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pyod==0.9.7) (1.0.2)
Requirement already satisfied: six in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pyod==0.9.7) (1.16.0)
Collecting statsmodels
Downloading statsmodels-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.8 MB)
|████████████████████████████████| 9.8 MB 58.2 MB/s
Requirement already satisfied: python-dateutil>=2.7 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from matplotlib->pyod==0.9.7) (2.8.2)
Requirement already satisfied: fonttools>=4.22.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from matplotlib->pyod==0.9.7) (4.29.1)
Requirement already satisfied: pyparsing>=2.2.1 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from matplotlib->pyod==0.9.7) (3.0.7)
Requirement already satisfied: packaging>=20.0 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from matplotlib->pyod==0.9.7) (21.3)
Requirement already satisfied: cycler>=0.10 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from matplotlib->pyod==0.9.7) (0.11.0)
Requirement already satisfied: pillow>=6.2.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from matplotlib->pyod==0.9.7) (9.0.1)
Requirement already satisfied: kiwisolver>=1.0.1 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from matplotlib->pyod==0.9.7) (1.3.2)
Collecting llvmlite<0.39,>=0.38.0rc1
Downloading llvmlite-0.38.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.5 MB)
|████████████████████████████████| 34.5 MB 61.2 MB/s
Requirement already satisfied: setuptools in /root/venv/lib/python3.7/site-packages (from numba>=0.35->pyod==0.9.7) (47.1.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from scikit_learn>=0.20.0->pyod==0.9.7) (3.1.0)
Requirement already satisfied: pandas>=0.25 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels->pyod==0.9.7) (1.2.5)
Collecting patsy>=0.5.2
Downloading patsy-0.5.2-py2.py3-none-any.whl (233 kB)
|████████████████████████████████| 233 kB 56.4 MB/s
Requirement already satisfied: pytz>=2017.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pandas>=0.25->statsmodels->pyod==0.9.7) (2021.3)
Building wheels for collected packages: pyod
Building wheel for pyod (setup.py) ... done
Created wheel for pyod: filename=pyod-0.9.7-py3-none-any.whl size=136265 sha256=00b5b81c3328f1db6b697f5c03c4fe8de40dc0d669c45cc207f5f39af8151fe0
Stored in directory: /root/.cache/pip/wheels/ce/14/ae/60cbb36511e59bc12f8f0883805f586db3b315972b54865d33
Successfully built pyod
Installing collected packages: llvmlite, numba, patsy, statsmodels, pyod
Successfully installed llvmlite-0.38.0 numba-0.55.1 patsy-0.5.2 pyod-0.9.7 statsmodels-0.13.2
WARNING: You are using pip version 20.1.1; however, version 22.0.3 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
from scipy import stats
import matplotlib.font_manager
from pyod.models.knn import KNN
from pyod.utils.data import generate_data, get_outliers_inliers
# generating a random dataset with two features
X_train, y_train = generate_data(n_train=300, train_only=True, n_features=2)
# Setting the percentage of outliers
outlier_fraction = 0.1
# Storing the outliers and inliers in different numpy arrays
X_outliers, X_inliers = get_outliers_inliers(X_train, y_train)
n_inliers = len(X_inliers)
n_outliers = len(X_outliers)
# Separating the two features; indexing with a one-element list already
# yields an (n, 1) column, so no extra reshape is needed
f1 = X_train[:, [0]]
f2 = X_train[:, [1]]
# Visualising the dataset
# create a meshgrid over [-10, 10] x [-10, 10]; it is scored later in this
# file when the decision contour is drawn
xx, yy = np.meshgrid(np.linspace(-10, 10, 200),
                     np.linspace(-10, 10, 200))
# scatter plot
plt.scatter(f1, f2)
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
# Training the detector. It is fitted on the features only: passing y_train
# made PyOD warn "y should not be presented in unsupervised learning."
clf = KNN(contamination=outlier_fraction)
clf.fit(X_train)
# Sign-flipped raw scores of the training points
# NOTE(review): presumably higher raw PyOD scores mean "more anomalous", so
# the flip makes low values the anomalous ones - confirm against the PyOD docs
scores_pred = clf.decision_function(X_train) * -1
y_pred = clf.predict(X_train)
# Counting the number of predictions that disagree with the generated labels
n_errors = (y_pred != y_train).sum()
print('The number of prediciton errors are ' + str(n_errors))
The number of prediciton errors are 0
/root/venv/lib/python3.7/site-packages/pyod/models/base.py:413: UserWarning: y should not be presented in unsupervised learning.
"y should not be presented in unsupervised learning.")
# Cut-off score: the (100 * outlier_fraction)-th percentile of the negated
# training scores
threshold = stats.scoreatpercentile(scores_pred, 100 * outlier_fraction)
# decision function calculates the raw anomaly score for every grid point;
# it is negated the same way as scores_pred so the threshold is comparable
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) * -1
Z = Z.reshape(xx.shape)
subplot = plt.subplot(1, 2, 1)
# draw red contour line where the negated score is equal to threshold
a = subplot.contour(xx, yy, Z, levels=[threshold],
                    linewidths=2, colors='red')
# fill orange where the negated score lies between threshold and the grid maximum
subplot.contourf(xx, yy, Z, levels=[threshold, Z.max()], colors='orange')
# Plot the label-derived groups instead of slicing X_train by position:
# slicing assumed the outliers are the last rows and, with zero outliers,
# X_train[:-0] is empty while X_train[-0:] is every row.
# scatter plot of inliers with white dots
b = subplot.scatter(X_inliers[:, 0], X_inliers[:, 1],
                    c='white', s=20, edgecolor='k')
# scatter plot of outliers with black dots
c = subplot.scatter(X_outliers[:, 0], X_outliers[:, 1],
                    c='black', s=20, edgecolor='k')
subplot.axis('tight')
# NOTE(review): ContourSet.collections is deprecated in newer Matplotlib
# releases; the install log in this file shows 3.5.1 - confirm before upgrading.
subplot.legend(
    [a.collections[0], b, c],
    ['learned decision function', 'true inliers', 'true outliers'],
    prop=matplotlib.font_manager.FontProperties(size=10))
subplot.set_title('K-Nearest Neighbours')
subplot.set_xlim((-10, 10))
subplot.set_ylim((-10, 10))
plt.show()