import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import norm
from scipy.stats import multivariate_normal
def gen_blob_data(num_rows=1000, ratio_of_outliers=0.05, scale=1, same_density=True, uniform_outlier=False):
    """Generate a two-blob 2-D data set with artificial outliers and a density score.

    Inliers are drawn from two Gaussians centred at (10, 0) and (0, 10).
    Outliers are appended after the inliers, and every row is scored with
    the pdf of the equal-weight two-Gaussian mixture.

    Args:
        num_rows: Total number of rows in the returned frame.
        ratio_of_outliers: Fraction of ``num_rows`` generated as outliers.
        scale: Multiplier applied to both covariance matrices when computing
            the score. The samples themselves are always drawn with the
            unscaled covariances, so ``scale`` only flattens or sharpens the
            score surface.
        same_density: If true both blobs use the identity covariance;
            otherwise the second blob uses ``3 * identity``.
        uniform_outlier: If true, outlier coordinates are drawn uniformly
            from the square spanned by the smallest and largest inlier
            coordinate. Otherwise outliers are drawn from two Gaussians
            centred at (10, 10) and (0, 0) with the second blob's covariance.

    Returns:
        A ``pandas.DataFrame`` with columns ``x``, ``y`` and ``score``;
        inlier rows come first, outlier rows last.
    """
    mean1 = [10, 0]
    mean2 = [0, 10]
    cov1 = [[1, 0], [0, 1]]
    cov2 = cov1 if same_density else [[3, 0], [0, 3]]

    # Round instead of truncating float products (int(0.29 * 100) == 28) and
    # derive the remaining counts by subtraction so the parts always add up
    # to exactly num_rows.
    num_rows = int(num_rows)
    n_outliers = int(round(ratio_of_outliers * num_rows))
    n_inliers = num_rows - n_outliers
    n_blob1 = n_inliers // 2
    n_blob2 = n_inliers - n_blob1

    x1, y1 = np.random.multivariate_normal(mean1, cov1, n_blob1).T
    x2, y2 = np.random.multivariate_normal(mean2, cov2, n_blob2).T
    x, y = np.concatenate((x1, x2)), np.concatenate((y1, y2))

    # Artificial outliers.
    if not uniform_outlier:
        n_out1 = n_outliers // 2
        n_out2 = n_outliers - n_out1
        ox1, oy1 = np.random.multivariate_normal([10, 10], cov2, n_out1).T
        ox2, oy2 = np.random.multivariate_normal([0, 0], cov2, n_out2).T
        ox, oy = np.concatenate((ox1, ox2)), np.concatenate((oy1, oy2))
    else:
        ox, oy = np.random.uniform(low=min(np.min(x), np.min(y)),
                                   high=max(np.max(x), np.max(y)),
                                   size=(2, n_outliers))
    x, y = np.concatenate((x, ox)), np.concatenate((y, oy))

    # Score all points in one call per mixture component; the pdf accepts an
    # (N, 2) array, so no per-point Python loop is needed.
    points = np.column_stack((x, y))
    score = (0.5 * multivariate_normal.pdf(points, mean1, scale * np.array(cov1))
             + 0.5 * multivariate_normal.pdf(points, mean2, scale * np.array(cov2)))

    data = pd.DataFrame({'x': x, 'y': y})
    data['score'] = score
    return data
# Preview call; outside an interactive session the returned rows are discarded.
gen_blob_data().head()

# One scatter plot per blob configuration, coloured by the density score.
for _blob_kwargs in (
    {},
    {'scale': 4},
    {'scale': 4, 'ratio_of_outliers': 0.01},
    {'scale': 4, 'uniform_outlier': True},
    {'scale': 4, 'ratio_of_outliers': 0.02, 'same_density': False},
):
    _blob_frame = gen_blob_data(**_blob_kwargs)
    _blob_frame.plot.scatter(x='x', y='y', c='score', cmap='RdBu')
    plt.show()

# Noisy sine wave: 500 evenly spaced abscissae on [0, 1] plus Gaussian noise
# with standard deviation 0.25.
x = np.linspace(0, 1, 500)
y = 0.5 * np.sin(20 * x) + np.random.normal(0, 0.25, len(x))
plt.scatter(x, y)
plt.show()
def sin_pdf(x, y):
    """Return the normal density of ``y`` around the sine curve at ``x``.

    The normal distribution is centred on ``0.5 * sin(20 * x)`` and has a
    standard deviation of 0.5.
    """
    centre = 0.5 * np.sin(20 * x)
    return norm.pdf(y, loc=centre, scale=0.5)


# Element-wise wrapper used to evaluate the density on broadcastable grids.
vsinpdf = np.vectorize(sin_pdf)
# Evaluate the sine-band density on a regular grid (step 0.02 on both axes)
# and draw it as filled contours.
_grid_step = 0.02
x = np.arange(0, 1, _grid_step)
y = np.arange(-1, 1, _grid_step)

# Sparse grids: xx is a single row and yy a single column; vsinpdf broadcasts
# them to the full (len(y), len(x)) surface.
xx, yy = np.meshgrid(x, y, sparse=True)
z = vsinpdf(xx, yy)

h = plt.contourf(x, y, z)
plt.show()
def gen_sin_data(num_rows=1000, ratio_of_outliers=0.05, scale=0.5, freq=20):
    """Generate a noisy sine-wave data set with uniform outliers and a density score.

    Inliers lie on ``0.5 * sin(freq * x)`` for evenly spaced ``x`` in [0, 1],
    perturbed by Gaussian noise. Outliers use evenly spaced ``x`` in [0, 1]
    and ``y`` drawn uniformly between the smallest and largest inlier ``y``.

    Args:
        num_rows: Total number of rows in the returned frame.
        ratio_of_outliers: Fraction of ``num_rows`` generated as outliers.
        scale: Standard deviation of the normal density used for the score.
            The sampling noise is fixed at a standard deviation of 0.5
            regardless of this value.
        freq: Angular frequency of the sine curve.

    Returns:
        A ``pandas.DataFrame`` with columns ``x``, ``y`` and ``score``;
        inlier rows come first, outlier rows last.
    """
    # Round instead of truncating float products (int(0.29 * 100) == 28) and
    # derive the inlier count by subtraction so the rows add up to num_rows.
    num_rows = int(num_rows)
    n_outliers = int(round(ratio_of_outliers * num_rows))
    n_inliers = num_rows - n_outliers

    x = np.linspace(0, 1, n_inliers)
    # Sampling noise is fixed at 0.5; `scale` only affects the score below.
    y = 0.5 * np.sin(freq * x) + np.random.normal(0, 0.5, len(x))

    # Uniform random outliers.
    x_uni = np.linspace(0, 1, n_outliers)
    y_uni = np.random.uniform(np.min(y), np.max(y), len(x_uni))
    x, y = np.concatenate((x, x_uni)), np.concatenate((y, y_uni))

    data = pd.DataFrame({'x': x, 'y': y})
    # norm.pdf is already element-wise, so no np.vectorize wrapper is needed.
    data['score'] = norm.pdf(y, loc=0.5 * np.sin(freq * x), scale=scale)
    return data
# Preview call; outside an interactive session the returned rows are discarded.
gen_sin_data().head()

# Same generator, scored with two different density widths.
for _sin_scale in (0.5, 1):
    data = gen_sin_data(1000, scale=_sin_scale)
    data.plot.scatter(x='x', y='y', c='score', cmap='RdBu')
    plt.show()
def gen_ring_data(num_rows=1000, ratio_of_outliers=0.1, scale=0.5, radius=2):
    """Generate a noisy ring data set with uniform outliers and a density score.

    Inliers are placed at uniformly random angles on a circle of the given
    radius around the origin, with independent Gaussian noise added to both
    coordinates. Outliers are drawn uniformly from the bounding box of the
    inliers. The score is the normal density of each point's distance from
    the origin, centred on ``radius``.

    Args:
        num_rows: Total number of rows in the returned frame.
        ratio_of_outliers: Fraction of ``num_rows`` generated as outliers.
        scale: Standard deviation of the normal density used for the score.
            The sampling noise is fixed at a standard deviation of 0.5
            regardless of this value.
        radius: Radius of the ring.

    Returns:
        A ``pandas.DataFrame`` with columns ``x``, ``y`` and ``score``;
        inlier rows come first, outlier rows last.
    """
    # Round instead of truncating float products (int(0.29 * 100) == 28) and
    # derive the inlier count by subtraction so the rows add up to num_rows.
    num_rows = int(num_rows)
    n_outliers = int(round(ratio_of_outliers * num_rows))
    n_inliers = num_rows - n_outliers

    # Random angles.
    alpha = 2 * np.pi * np.random.rand(n_inliers)
    # Sampling noise is fixed at 0.5; `scale` only affects the score below.
    x = radius * np.cos(alpha) + np.random.normal(0, 0.5, len(alpha))
    y = radius * np.sin(alpha) + np.random.normal(0, 0.5, len(alpha))

    # Uniform random outliers.
    x_uni = np.random.uniform(np.min(x), np.max(x), n_outliers)
    y_uni = np.random.uniform(np.min(y), np.max(y), len(x_uni))
    x, y = np.concatenate((x, x_uni)), np.concatenate((y, y_uni))

    data = pd.DataFrame({'x': x, 'y': y})
    # norm.pdf is already element-wise, so no np.vectorize wrapper is needed.
    data['score'] = norm.pdf(np.sqrt(x**2 + y**2), loc=radius, scale=scale)
    return data
# One scatter plot per ring configuration, coloured by the density score.
for _ring_kwargs in (
    {},
    {'scale': 1},
    {'scale': 1, 'radius': 4},
):
    _ring_frame = gen_ring_data(**_ring_kwargs)
    _ring_frame.plot.scatter(x='x', y='y', c='score', cmap='RdBu')
    plt.show()