%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
import sys
from matplotlib import pyplot as plt
from pathlib import Path
from tsfresh.feature_extraction import extract_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import MinimalFCParameters
from argparse import ArgumentParser
from module import *
from sklearn.preprocessing import MinMaxScaler
%matplotlib inline
%config InlineBackend.figure_format = 'png'
data_path = Path('data')
random_state = 1
n_jobs = None
events_names = {0: 'Normal',
                1: 'Abrupt Increase of BSW',
                2: 'Spurious Closure of DHSV',
                3: 'Severe Slugging',
                4: 'Flow Instability',
                5: 'Rapid Productivity Loss',
                6: 'Quick Restriction in PCK',
                7: 'Scaling in PCK',
                8: 'Hydrate in Production Line'
                }
vars = ['P-PDG',
        'P-TPT',
        'T-TPT',
        'P-MON-CKP',
        'T-JUS-CKP',
        'P-JUS-CKGL',
        'T-JUS-CKGL',
        'QGL']
# Parameter settings (the ArgumentParser instance is used here as a simple attribute container)
args = ArgumentParser()
args.columns = ['timestamp'] + vars + ['class']
args.normal_class_code = 0
args.undesirable_event_code = 1 # Undesirable event of interest
abnormal_classes_codes = [0, 1, 2, 3, 4, 5, 6, 7, 8]  # class codes to keep (0 = Normal is included)
downsample_rate = 30 # Adjusts frequency of sampling to the dynamics
# of the undesirable event of interest
args.sample_size_default = 60 # In observations (after downsample)
args.sample_size_normal_period = 5 # In observations (after downsample)
args.max_samples_per_period = 15 # Limitation for safety
max_nan_percent = 0.15 # For selection of useful variables
std_vars_min = 0.01 # For selection of useful variables
disable_progressbar = True # For less output
# Gets all real instances and keeps only those whose class code is in abnormal_classes_codes
#real_instances = get_instances_with_undesirable_event(data_path, args.undesirable_event_code,real=True,simulated=False, drawn=False)
real_instances = pd.DataFrame(class_and_file_generator(data_path, real=True, simulated=False, drawn=False),
                              columns=['class_code', 'instance_path'])
real_instances = real_instances.loc[real_instances.iloc[:, 0].isin(abnormal_classes_codes)].reset_index(drop=True)
sim_instances = pd.DataFrame(class_and_file_generator(data_path, real=False, simulated=True, drawn=False),
                             columns=['class_code', 'instance_path'])
sim_instances = sim_instances.loc[sim_instances.iloc[:, 0].isin(abnormal_classes_codes)].reset_index(drop=True)
drawn_instances = pd.DataFrame(class_and_file_generator(data_path, real=False, simulated=False, drawn=True),
                               columns=['class_code', 'instance_path'])
drawn_instances = drawn_instances.loc[drawn_instances.iloc[:, 0].isin(abnormal_classes_codes)].reset_index(drop=True)
instances = pd.concat([real_instances, sim_instances, drawn_instances]).reset_index(drop=True)
print(f"number of total instances = {len(real_instances)+ len(sim_instances) + len(drawn_instances)}")
print(f"number of real instances = {len(real_instances)}")
print(f"number of simulated instances = {len(sim_instances)}")
print(f"number of drawn instances = {len(drawn_instances)}")
# Loads all real, simulated and drawn instances and applies downsample
instance_id = 0
df_real_instances, instance_id = load_and_downsample_instances(real_instances, downsample_rate, 'real', instance_id,args)
df_simul_instances, instance_id = load_and_downsample_instances(sim_instances,downsample_rate,'simulated', instance_id,args)
df_drawn_instances, instance_id = load_and_downsample_instances(drawn_instances,downsample_rate,'drawn',instance_id, args)
df_instances = pd.concat([df_real_instances, df_simul_instances, df_drawn_instances])
idxs = (df_instances['source'] == 'real')  # screen variables using only the real instances
good_vars = np.isnan(df_instances.loc[idxs][vars]).mean(0) <= max_nan_percent
good_vars = list(good_vars.index[good_vars])
bad_vars = list(set(vars)-set(good_vars))
df_instances_good_vars = df_instances.drop(columns=bad_vars, errors='ignore')
print(good_vars)
# df_instances_logical = logical_threshold(df_instances_good_vars)
df_instances_logical = logical_threshold(df_instances_good_vars, eliminate=False)
# Totally eliminated
elim_set_total = set(df_instances_good_vars['instance_id']) - set(df_instances_logical['instance_id'])
# Partially eliminated
loss = {}
for id in sorted(list(set(df_instances_good_vars['instance_id']).intersection(set(df_instances_logical['instance_id'])))):
    before = len(df_instances_good_vars[df_instances_good_vars['instance_id'] == id])
    after = len(df_instances_logical[df_instances_logical['instance_id'] == id])
    if before != after:
        loss[id] = before - after
elim_set_partial = set(loss.keys())
for var in list(elim_set_partial):
    df_instances_logical = df_instances_logical[~(df_instances_logical['instance_id'] == var)]
elim_set = (elim_set_total | elim_set_partial)
elim_id = sorted(list(elim_set))
elim_instances = instances.loc[elim_id]
logical_set = (set(real_instances.index) - elim_set)
#clean_set = (set(instances.index) - elim_set)
logical_id = sorted(list(logical_set))
logical_instances = instances.loc[logical_id]
max_frozen_percent = 0.1
std_vars_min = 0.0001
frozen = np.zeros(len(good_vars))  # one "frozen" counter per remaining variable
for id in logical_id:
    scaler = MinMaxScaler()
    idxs = (df_instances_logical['instance_id'] == id)
    df_scaled = scaler.fit_transform(df_instances_logical[idxs][good_vars])
    df_scaled = pd.DataFrame(df_scaled, columns=good_vars)
    frozen += np.nanstd(df_scaled[good_vars], 0) < std_vars_min
clean_vars = np.array(good_vars)
clean_vars = list(clean_vars[(frozen / len(logical_id) <= max_frozen_percent)])
frozen_vars = list(set(good_vars) - set(clean_vars))
df_instances_clean_frozen = df_instances_logical.drop(columns=frozen_vars, errors='ignore')
print(clean_vars)
eliminate = False
elim_clean_id = []
in_frozen = 0
in_total_frozen = 0
for id in logical_id:
    scaler = MinMaxScaler()
    idxs = (df_instances_clean_frozen['instance_id'] == id)
    df_scaled = scaler.fit_transform(df_instances_clean_frozen[idxs][clean_vars])
    df_scaled = pd.DataFrame(df_scaled, columns=clean_vars)
    frozen_mask = np.nanstd(df_scaled[clean_vars], 0) < std_vars_min
    if frozen_mask.any():
        if eliminate:
            elim_clean_id.append(id)
        in_frozen += (df_instances_clean_frozen['instance_id'] == id)
        if not eliminate:
            clean_vars_instance = np.array(clean_vars)
            df_instances_clean_frozen.loc[in_frozen.astype(bool), clean_vars_instance[frozen_mask]] = 0
            if len(set(clean_vars_instance[frozen_mask])) == len(clean_vars):
                elim_clean_id.append(id)
                in_total_frozen += (df_instances_clean_frozen['instance_id'] == id)
clean_id = list(set(logical_id) - set(elim_clean_id))
if eliminate:
    df_instances_clean = df_instances_clean_frozen[~in_frozen.astype(bool)]
else:
    if not isinstance(in_total_frozen, int):
        df_instances_clean = df_instances_clean_frozen[~in_total_frozen.astype(bool)]
    else:
        df_instances_clean = df_instances_clean_frozen
df_instances_clean[clean_vars] = df_instances_clean[clean_vars].interpolate(method='linear')
# with open('./cached/before_lstm.pkl', 'wb') as f:
# pickle.dump(df_instances_clean, f)
# with open('./cached/before_lstm.pkl', 'rb') as f:
# df_instances_clean = pickle.load(f)
from torch.utils.data import DataLoader, Dataset, random_split
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import torch.nn.functional as F
from torch.autograd import Variable
from argparse import ArgumentParser
from tqdm.auto import tqdm
from sklearn.preprocessing import StandardScaler
import os
from copy import copy
df_mission2 = df_instances_clean
# Model and training hyperparameter settings
args = ArgumentParser()
args.columns = ['timestamp'] + vars + ['class']
args.normal_class_code = 0
args.undesirable_event_code = 1 # Undesirable event of interest
abnormal_classes_codes = [0, 1, 2, 3, 4, 5, 6, 7, 8]
args.num_epoch = 200
args.batch_size = 64
args.learning_rate = 0.003
args.training_ratio = 0.7
args.validation_ratio = 0.15
args.test_ratio = 0.15
args.input_size = len(clean_vars)  # must match the number of input variables fed to the LSTM
args.window_size = 3
args.hidden_size = 128
args.num_layers = 2
args.num_classes = 2  # Binary: Normal vs. Abnormal (labels are one-hot encoded from the pandas DataFrame)
# Define the Dataset object (by subclassing torch.utils.data.Dataset)
class LSTMDataset(Dataset):
    def __init__(self, data: pd.DataFrame, window_size, input_vars, output_len=1, predict=None, ignore_nan=True, scaler=None):
        # dataframe columns: timestamp, measured variables, class (Normal / Transient / Abnormal), instance_id, source
        if ignore_nan:
            nan = data.isnull()['class']
            self.data = data[~nan].copy()
            self.data.loc[self.data['class'] != 0, 'class'] = 1  # collapse all event classes to 1 for binary classification
        else:
            self.data = data.copy()
        if scaler:
            # apply the scaler fitted on the training instances (do not refit per instance)
            self.data.loc[:, input_vars] = scaler.transform(self.data[input_vars])
        self.window_size = window_size
        self.input_vars = input_vars
        self.output_len = output_len
        self.ignore_nan = ignore_nan
        self.predict = predict
    def __len__(self):
        return len(self.data) - max(self.output_len, self.window_size)
        # return len(self.data) - self.output_len
    def __getitem__(self, idx):
        x = self.data[self.input_vars].iloc[idx:idx + self.window_size, :].to_numpy()
        if self.predict:
            idx_out = idx + self.window_size + self.predict  # label taken `predict` steps after the window
        else:
            idx_out = idx + self.window_size  # label immediately after the window
        # assumes both classes (0 and 1) appear in the instance so the one-hot label has width num_classes
        y = pd.get_dummies(self.data['class']).iloc[idx_out:idx_out + self.output_len, :].to_numpy().astype(np.float32).squeeze()
        return torch.FloatTensor(x), torch.FloatTensor(y)  # returns (input window, one-hot label)
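# Quick sanity check of the windowing behaviour. This is an illustrative sketch only: the tiny
# DataFrame and its column names (_demo_*, 'VAR-A', 'VAR-B') are made up and not part of the dataset.
_demo_df = pd.DataFrame({
    'timestamp': range(10),
    'VAR-A': np.linspace(0.0, 1.0, 10),
    'VAR-B': np.linspace(1.0, 0.0, 10),
    'class': [0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
})
_demo_ds = LSTMDataset(data=_demo_df, window_size=3, input_vars=['VAR-A', 'VAR-B'])
_demo_x, _demo_y = _demo_ds[0]  # first window and the label of the step right after it
print(len(_demo_ds), _demo_x.shape, _demo_y.shape)  # expected: 7 torch.Size([3, 2]) torch.Size([2])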
# Define the LSTM model (reference: https://velog.io/@choonsik_mom/pytorch%EB%A1%9C-LSTM-%EA%B5%AC%ED%98%84%ED%95%98%EA%B8%B0)
class LSTM(nn.Module):
    def __init__(self, num_classes, input_size, hidden_size, num_layers):
        super(LSTM, self).__init__()
        self.num_classes = num_classes
        self.num_layers = num_layers
        self.input_size = input_size
        self.hidden_size = hidden_size
        # self.seq_length = seq_length
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                            num_layers=num_layers, batch_first=True)
        self.fc = nn.Sequential(
            nn.Linear(hidden_size, int(hidden_size / 4)),
            nn.BatchNorm1d(int(hidden_size / 4)),
            # nn.ReLU(inplace=True),
            nn.Dropout(0.2),
            nn.Linear(int(hidden_size / 4), num_classes)
        )
        self.init_weight()
    def init_weight(self):
        for param in self.parameters():
            nn.init.normal_(param.data)
    def forward(self, x):
        # initial hidden and cell states (zeros)
        h_0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)
        c_0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)
        # Propagate the input through the LSTM and classify from the last layer's final hidden state
        _, (h, _) = self.lstm(x, (h_0, c_0))
        out = self.fc(h[-1].view(-1, self.hidden_size))
        # out = F.sigmoid(out)
        return out
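# Shape sanity check for the model. Illustrative sketch only: the small hyperparameters and the
# random dummy batch (_demo_*) below are placeholders, not the values used for training.
_demo_model = LSTM(num_classes=2, input_size=3, hidden_size=16, num_layers=1)
_demo_batch = torch.randn(4, 3, 3)  # (batch, window_size, input_size)
print(_demo_model(_demo_batch).shape)  # expected: torch.Size([4, 2]) -> one logit per class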
def make_scaler(id_set, df, vars):
    # fit a StandardScaler on the rows belonging to the given instance ids
    idx = df['instance_id'].isin(list(id_set))
    only = df[idx][vars]
    scaler = StandardScaler()
    scaler.fit(only)
    return scaler
def add_noise(original, std=0.01):
    # add small element-wise Gaussian noise as data augmentation
    return original + std * torch.randn_like(original)
def use_synthetic(df, input_vars, sim_idx, drawn_idx, sim_use=True, drawn_use=True):
    # build scalers for the simulated / drawn subsets when they are available and enabled
    sim_available = False
    drawn_available = False
    sc_sim = None
    sc_drawn = None
    if len(sim_idx) != 0:
        sim_available = True
        print('Valid simulated dataset')
        if sim_use:
            sc_sim = make_scaler(sim_idx, df, input_vars)
    if len(drawn_idx) != 0:
        drawn_available = True
        print('Valid drawn dataset')
        if drawn_use:
            sc_drawn = make_scaler(drawn_idx, df, input_vars)
    return sc_sim, sc_drawn
target_code = 2  # Spurious Closure of DHSV
args.sim_use = True
args.drawn_use = False
real_idx = list(real_instances[real_instances['class_code'] == target_code].index.values)
# The offsets align these indices with the sequential instance_id values assigned during loading
# (real instances come first, then simulated, then drawn).
sim_idx = list(sim_instances[sim_instances['class_code'] == target_code].index.values + 1025)
drawn_idx = list(drawn_instances[drawn_instances['class_code'] == target_code].index.values + 1963)
model = LSTM(num_classes=args.num_classes, input_size=args.input_size, hidden_size=args.hidden_size, num_layers=args.num_layers)
## Split instance ids into train / valid / test (real instances only)
train_id, valid_id, test_id = random_split(real_idx, [args.training_ratio, args.validation_ratio, args.test_ratio])
# Training setup
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95)
# Scaling
scaler = make_scaler(train_id, df_mission2, clean_vars)
scaler_sim, scaler_drawn = use_synthetic(df_mission2, clean_vars, sim_idx,drawn_idx, sim_use=args.sim_use, drawn_use=args.drawn_use)
train_num = len(train_id)
if args.sim_use:
    train_num += len(sim_idx)
if args.drawn_use:
    train_num += len(drawn_idx)
print(f'Train: {train_num} \nValid: {len(valid_id)}\nTest: {len(test_id)}')
logger = {'t_loss':[], 'v_loss':[], 't_acc':[], 'v_acc':[]}
min_valid_loss = 1e8
for epoch in tqdm(range(args.num_epoch)):
    model.train()  # re-enable Dropout/BatchNorm training behaviour (model.eval() is called before validation below)
    train_loss = 0.0
    train_acc_list = []
    for id in train_id:
        df_tmp = df_mission2[df_mission2['instance_id'] == id]
        ds = LSTMDataset(data=df_tmp, window_size=args.window_size, input_vars=clean_vars, scaler=scaler)
        temp_dataloader = DataLoader(ds, batch_size=args.batch_size, shuffle=False)
        train_loss_tmp = 0.0
        train_acc_tmp = 0
        len_tmp = 0
        for batch in temp_dataloader:
            x, y = batch
            noised_x = add_noise(x)
            pred = model(noised_x)
            loss = criterion(pred, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            _, pred_labels = torch.max(pred, dim=1)
            _, true_labels = torch.max(y, dim=1)
            train_acc_tmp += torch.sum(pred_labels == true_labels)
            len_tmp += len(x)
            train_loss_tmp += loss.item()
        train_loss_tmp /= len(temp_dataloader)
        train_loss += train_loss_tmp
        train_acc_list.append(float(train_acc_tmp / len_tmp))
    if args.sim_use and (len(sim_idx) != 0):
        for id in sim_idx:
            df_tmp = df_mission2[df_mission2['instance_id'] == id]
            ds = LSTMDataset(data=df_tmp, window_size=args.window_size, input_vars=clean_vars, scaler=scaler_sim)
            temp_dataloader = DataLoader(ds, batch_size=args.batch_size, shuffle=False)
            train_loss_tmp = 0.0
            train_acc_tmp = 0
            len_tmp = 0
            for batch in temp_dataloader:
                x, y = batch
                noised_x = add_noise(x)
                pred = model(noised_x)
                loss = criterion(pred, y)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                _, pred_labels = torch.max(pred, dim=1)
                _, true_labels = torch.max(y, dim=1)
                train_acc_tmp += torch.sum(pred_labels == true_labels)
                len_tmp += len(x)
                train_loss_tmp += loss.item()
            train_loss_tmp /= len(temp_dataloader)
            train_loss += train_loss_tmp
            train_acc_list.append(float(train_acc_tmp / len_tmp))
    if args.drawn_use and (len(drawn_idx) != 0):
        for id in drawn_idx:
            df_tmp = df_mission2[df_mission2['instance_id'] == id]
            ds = LSTMDataset(data=df_tmp, window_size=args.window_size, input_vars=clean_vars, scaler=scaler_drawn)
            temp_dataloader = DataLoader(ds, batch_size=args.batch_size, shuffle=False)
            train_loss_tmp = 0.0
            train_acc_tmp = 0
            len_tmp = 0
            for batch in temp_dataloader:
                x, y = batch
                noised_x = add_noise(x)
                pred = model(noised_x)
                loss = criterion(pred, y)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                _, pred_labels = torch.max(pred, dim=1)
                _, true_labels = torch.max(y, dim=1)
                train_acc_tmp += torch.sum(pred_labels == true_labels)
                len_tmp += len(x)
                train_loss_tmp += loss.item()
            train_loss_tmp /= len(temp_dataloader)
            train_loss += train_loss_tmp
            train_acc_list.append(float(train_acc_tmp / len_tmp))
    scheduler.step()
    train_acc = np.average(train_acc_list)
    train_loss /= train_num
    model.eval()
    valid_loss = 0.0
    valid_acc_list = []
    for id in valid_id:
        df_tmp = df_mission2[df_mission2['instance_id'] == id]
        ds = LSTMDataset(data=df_tmp, window_size=args.window_size, input_vars=clean_vars, scaler=scaler)
        temp_dataloader = DataLoader(ds, batch_size=args.batch_size, shuffle=False)
        valid_loss_tmp = 0.0
        valid_acc_tmp = 0
        len_tmp = 0
        for batch in temp_dataloader:
            x, y = batch
            pred = model(x)
            loss = criterion(pred, y)
            _, pred_labels = torch.max(pred, dim=1)
            _, true_labels = torch.max(y, dim=1)
            valid_acc_tmp += torch.sum(pred_labels == true_labels)
            len_tmp += len(x)
            valid_loss_tmp += loss.item()
        valid_loss_tmp /= len(temp_dataloader)
        valid_loss += valid_loss_tmp
        valid_acc_list.append(float(valid_acc_tmp / len_tmp))
    valid_acc = np.average(valid_acc_list)
    valid_loss /= len(valid_id)
    if min_valid_loss > valid_loss:
        min_valid_loss = valid_loss
        if not os.path.exists('./cache'):
            os.mkdir('./cache')
        torch.save(model.state_dict(), './cache/mdl.pth')
    logger['t_loss'].append(train_loss)
    logger['v_loss'].append(valid_loss)
    logger['t_acc'].append(train_acc)
    logger['v_acc'].append(valid_acc)
    if epoch % 5 == 0:
        print("Epoch: %d, train loss: %1.5f, valid loss: %1.5f, train acc: %1.3f, valid acc: %1.3f" % (epoch, train_loss, valid_loss, train_acc, valid_acc))
model.load_state_dict(torch.load('./cache/mdl.pth'))  # restore the checkpoint with the lowest validation loss
plt.plot(logger['t_loss'], label='train loss')
plt.plot(logger['v_loss'], label='valid loss')
plt.plot(logger['t_acc'], label='train acc')
plt.plot(logger['v_acc'], label='valid acc')
plt.legend()
predictions = []
reals = []
model.eval()
test_acc = []
for id in test_id:
    df_tmp = df_mission2[df_mission2['instance_id'] == id]
    ds = LSTMDataset(data=df_tmp, window_size=args.window_size, input_vars=clean_vars, scaler=scaler)
    temp_dataloader = DataLoader(ds, batch_size=args.batch_size, shuffle=False)
    test_acc_temp = 0
    len_temp = 0
    for batch in temp_dataloader:
        x, y = batch
        pred = model(x)
        _, pred_labels = torch.max(pred, dim=1)
        _, true_labels = torch.max(y, dim=1)
        test_acc_temp += torch.sum(pred_labels == true_labels)
        len_temp += len(x)
        predictions.extend(pred.detach().numpy())
        reals.extend(y.detach().numpy())
    test_acc.append(float(test_acc_temp / len_temp))
print(np.average(test_acc))
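# Optional: beyond plain accuracy, a per-class breakdown of the test-set results can be computed
# from the `reals` / `predictions` lists collected above. This is a sketch, not part of the original
# pipeline; the class names simply follow the binary Normal/Abnormal mapping used in LSTMDataset.
from sklearn.metrics import classification_report, confusion_matrix
_true = np.argmax(np.array(reals), axis=1)        # one-hot labels -> class indices
_pred = np.argmax(np.array(predictions), axis=1)  # logits -> predicted class indices
print(confusion_matrix(_true, _pred, labels=[0, 1]))
print(classification_report(_true, _pred, labels=[0, 1], target_names=['Normal', 'Abnormal'], zero_division=0))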
def inference(id):
    # run the trained model on a single instance and plot predicted vs. true labels
    predictions = []
    reals = []
    model.eval()
    df_tmp = df_mission2[df_mission2['instance_id'] == id]
    ds = LSTMDataset(data=df_tmp, window_size=args.window_size, input_vars=clean_vars, scaler=scaler)
    temp_dataloader = DataLoader(ds, batch_size=args.batch_size, shuffle=False)
    test_acc_temp = 0
    len_temp = 0
    for batch in temp_dataloader:
        x, y = batch
        pred = model(x)
        _, pred_labels = torch.max(pred, dim=1)
        _, true_labels = torch.max(y, dim=1)
        test_acc_temp += torch.sum(pred_labels == true_labels)
        len_temp += len(x)
        predictions.extend(pred.detach().numpy())
        reals.extend(y.detach().numpy())
    test_acc = float(test_acc_temp / len_temp)
    print(test_acc)
    rs = []
    ps = []
    for p in reals:
        rs.append(np.where(p == np.max(p))[-1][0])
    for p in predictions:
        ps.append(np.where(p == np.max(p))[-1][0])
    plt.plot(rs, label='real')
    plt.plot(ps, label='pred')
    plt.legend()
inference(test_id[1])
inference(train_id[8])