Pre-Requisites to Understanding this Article
What's an Outlier/Anomaly? 🤷
Should we simply remove Outliers if they're one-off events?
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
# Plot styling: seaborn's white-grid theme and a wide default figure size.
sns.set_style("whitegrid")
mpl.rcParams['figure.figsize'] = (20, 5)

# Load the raw dataset and index it by the date column; no date parsing is
# requested here, so the index holds whatever read_csv inferred for it.
dataframe = pd.read_csv(
    r'/work/Time_Series_Chemical_Machinery_Dataset.csv'
).set_index("Date (DD/MM/YYYY)")

# Primary axis: every column of the frame, semi-transparent, with the number
# of major x ticks capped so the date labels stay readable.
ax1 = dataframe.plot(alpha=0.6)
ax1.xaxis.set_major_locator(plt.MaxNLocator(10))

# Secondary y-axis on the same x-axis: the 'Motor Trip Failure' series drawn
# as grey star markers on top of the other signals.
ax2 = ax1.twinx()
ax2.plot(dataframe['Motor Trip Failure'], color='grey', marker='*')
ax2.xaxis.set_major_locator(plt.MaxNLocator(10))

# Anchor the legend just outside the right edge of the plotting area.
ax1.legend(loc='upper left', bbox_to_anchor=(1.05, 1))

plt.title("Timeseries Plot of Chemical Machinery Data")
plt.tight_layout()
plt.show()
Step Two - Identification of our Outliers.
from adtk.detector import InterQuartileRangeAD
from adtk.visualization import plot
# Inter-quartile-range detector from ADTK; c scales the IQR used as the
# tolerance band around the quartiles.
iqr_ad = InterQuartileRangeAD(c=1.5)

# Re-read the dataset, this time with the date column parsed (day first) and
# used as the index.
example_df = pd.read_csv(
    r'/work/Time_Series_Chemical_Machinery_Dataset.csv',
    index_col="Date (DD/MM/YYYY)",
    parse_dates=True,
    dayfirst=True,
)

# Every column except the failure flag gets its own detection pass.
column_names = example_df.drop('Motor Trip Failure', axis=1).columns

# One figure per column: the series with its detected anomalies marked, and
# the 'Motor Trip Failure' series overlaid for comparison.
for item in column_names:
    anomalies = iqr_ad.fit_detect(example_df[item])
    plot(
        example_df[item],
        anomaly=anomalies,
        ts_linewidth=3,
        ts_markersize=3,
        anomaly_markersize=5,
        anomaly_color='deeppink',
        anomaly_tag="marker",
    )
    # No axes are passed, so pandas draws this on the current axes.
    example_df['Motor Trip Failure'].plot(color='black', marker="D", linewidth=1.5)
    plt.title(item)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
# Per-column quartiles and inter-quartile range of the dataset.
Q1 = example_df.quantile(0.25)
Q3 = example_df.quantile(0.75)
IQR = Q3 - Q1

# Fences placed 1.5 * IQR beyond the first and third quartiles.
Lower_Limit = Q1 - 1.5 * IQR
Upper_Limit = Q3 + 1.5 * IQR

# Keep only the rows in which no column falls outside its own fences.
# NOTE(review): the fences are computed for every column of example_df,
# including 'Motor Trip Failure' -- confirm that the failure flag is meant to
# take part in the outlier filter.
No_Outliers_Dataframe = example_df[
    ~(example_df.lt(Lower_Limit) | example_df.gt(Upper_Limit)).any(axis=1)
]
import plotly.express as px
# Box plot of the outlier-free rows that are not motor trip failures.
# The filter is written as `!= 1` because `~series == 1` parses as
# `(~series) == 1`: the bitwise inversion is applied before the comparison,
# so it does not select the non-failure rows.
fig = px.box(
    No_Outliers_Dataframe[No_Outliers_Dataframe['Motor Trip Failure'] != 1],
    title="Outliers Removed in the Chemical Machinery Dataset",
)
fig.show()

# Box plot of the rows of the full dataset flagged as motor trip failures.
fig = px.box(
    example_df[example_df['Motor Trip Failure'] == 1],
    title="Outliers remaining in the Chemical Machinery Dataset",
)
fig.show()