import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings("ignore")
from IPython.display import Latex

# NOTE(review): the original "%%latex" cell magic was commented out below --
# it is IPython-only syntax (invalid in a plain .py file), must be the first
# line of a notebook cell, and had no LaTeX body here.
# %%latex

# Load the predictive-maintenance dataset bundled with pysurvival.
from pysurvival.datasets import Dataset
maintenance = Dataset('maintenance').load()
maintenance.head(10)

# Treat the event flag and the categorical predictors as pandas categories.
maintenance[['broken', 'team', 'provider']] = maintenance[['broken', 'team', 'provider']].astype('category')
maintenance.dtypes
# Reproducibility seed for the stochastic steps below (train/test split,
# KMeans clustering further down).
# NOTE(review): SEED was referenced (here and at the KMeans call) but never
# defined in the original, which raises NameError; defined here.
SEED = 42

# One-hot encode the categorical predictors; drop_first avoids the dummy trap.
categories = ['provider', 'team']
maintenance_dummy = pd.get_dummies(maintenance, columns=categories, drop_first=True)
maintenance_dummy.head()

# 80/20 split on row positions, then materialize the two frames.
from sklearn.model_selection import train_test_split
index_train, index_test = train_test_split(range(maintenance_dummy.shape[0]), test_size=0.2, random_state=SEED)
data_train = maintenance_dummy.loc[index_train].reset_index(drop=True)
data_test = maintenance_dummy.loc[index_test].reset_index(drop=True)

# Features = every column except the duration ('lifetime') and the event
# indicator ('broken').
features = np.setdiff1d(maintenance_dummy.columns, ['lifetime', 'broken']).tolist()
X_train, X_test = data_train[features], data_test[features]
T_train, T_test = data_train['lifetime'].values, data_test['lifetime'].values
E_train, E_test = np.array(data_train['broken'].values), np.array(data_test['broken'].values)
# Baseline: semi-parametric Cox proportional-hazards model, fitted with
# Newton-Raphson starting from zero-initialized coefficients.
from pysurvival.models.semi_parametric import CoxPHModel

coxph = CoxPHModel()
coxph.fit(
    X_train, T_train, E_train,
    lr=0.5,
    l2_reg=1e-2,
    init_method='zeros',
)
# --- pasted console output from the CoxPH fit (not code; commented out so the file parses) ---
# Performing Newton-Raphson optimization
#  * Iteration #1 - Loss = 1588.772 - ||grad||^2 = 319.12143
#  * Iteration #2 - Loss = 1260.426 - ||grad||^2 = 164.46688
#  * Iteration #3 - Loss = 1135.155 - ||grad||^2 = 99.67192
#  * Iteration #4 - Loss = 1053.192 - ||grad||^2 = 61.15221
#  * Iteration #5 - Loss = 996.584 - ||grad||^2 = 37.31715
#  * Iteration #6 - Loss = 959.229 - ||grad||^2 = 22.80550
#  * Iteration #7 - Loss = 935.345 - ||grad||^2 = 14.12548
#  * Iteration #8 - Loss = 920.224 - ||grad||^2 = 8.92736
#  * Iteration #9 - Loss = 910.730 - ||grad||^2 = 5.73887
#  * Iteration #10 - Loss = 904.914 - ||grad||^2 = 3.71561
#  * Iteration #11 - Loss = 901.465 - ||grad||^2 = 2.40797
#  * Iteration #12 - Loss = 899.464 - ||grad||^2 = 1.55605
#  * Iteration #13 - Loss = 898.312 - ||grad||^2 = 0.99669
#  * Iteration #14 - Loss = 897.650 - ||grad||^2 = 0.62547
#  * Iteration #15 - Loss = 897.272 - ||grad||^2 = 0.37932
#  * Iteration #16 - Loss = 897.059 - ||grad||^2 = 0.22022
#  * Iteration #17 - Loss = 896.941 - ||grad||^2 = 0.12224
#  * Iteration #18 - Loss = 896.878 - ||grad||^2 = 0.06530
#  * Iteration #19 - Loss = 896.845 - ||grad||^2 = 0.03393
#  * Iteration #20 - Loss = 896.828 - ||grad||^2 = 0.01732
#  * Iteration #21 - Loss = 896.819 - ||grad||^2 = 0.00876
#  * Iteration #22 - Loss = 896.815 - ||grad||^2 = 0.00440
#  * Iteration #23 - Loss = 896.812 - ||grad||^2 = 0.00221
#  * Iteration #24 - Loss = 896.811 - ||grad||^2 = 0.00111
#  * Iteration #25 - Loss = 896.811 - ||grad||^2 = 0.00055
# Converged after 25 iterations.
# Tag every Cox coefficient as significant / not significant at level alpha.
H0 = "not significant"
H1 = "significant"
alpha = 0.05

coxph_summary = coxph.get_summary()
p_vals = coxph_summary['p_values'].astype('float64')
coxph_summary[f'result (alpha={alpha})'] = p_vals.apply(lambda p: H1 if p < alpha else H0)
coxph_summary[['variables', 'coef', 'p_values', f'result (alpha={alpha})']]
from pysurvival.utils.metrics import concordance_index
from pysurvival.utils.display import integrated_brier_score, compare_to_actual
def evaluate(model, X, T, E, model_name=""):
    """Score a fitted pysurvival model on one dataset.

    Parameters
    ----------
    model : fitted pysurvival model
    X : DataFrame of feature columns
    T : array of observed lifetimes
    E : array of event indicators (1 = failure observed)
    model_name : str, row label of the returned frame

    Returns
    -------
    pd.DataFrame
        One row holding the C-index, the integrated Brier score, and the
        actual-vs-predicted error metrics renamed to RMSE / MADE / MAE.
    """
    # NOTE(review): the body's indentation was lost in extraction (the file
    # would not parse); restored here with no logic change.
    # compare_to_actual also renders a comparison plot as a side effect.
    errors = compare_to_actual(model, X, T, E, is_at_risk=False)
    c_index = concordance_index(model, X, T, E)
    ibs = integrated_brier_score(model, X, T, E)
    metrics = {'C-index': c_index, 'IBS': ibs}
    eval_df = pd.DataFrame(data={**metrics, **errors}, index=[model_name])
    return eval_df.rename(columns={'root_mean_squared_error': 'RMSE',
                                   'median_absolute_error': 'MADE',
                                   'mean_absolute_error': 'MAE'})
# Hold-out evaluation of the Cox PH baseline.
eval_coxph = evaluate(coxph, X_test, T_test, E_test, model_name="Cox PH")
eval_coxph

# Linear Multi-Task Logistic Regression over a 100-bin time grid.
from pysurvival.models.multi_task import LinearMultiTaskModel

l_mtlr = LinearMultiTaskModel(bins=100)
l_mtlr.fit(
    X_train, T_train, E_train,
    lr=1e-3,
    l2_reg=1e-6,
    l2_smooth=1e-6,
    init_method='orthogonal',
    optimizer='adamax',
    num_epochs=50,
)
eval_l_mtlr = evaluate(l_mtlr, X_test, T_test, E_test, model_name="Linear MTLR")
eval_l_mtlr
from pysurvival.models.multi_task import NeuralMultiTaskModel

# Neural MTLR with a single hidden layer of 100 ReLU units.
structure = [{'activation': 'ReLU', 'num_units': 100}]
n_mtlr = NeuralMultiTaskModel(structure=structure, bins=100)
n_mtlr.fit(
    X_train, T_train, E_train,
    lr=1e-3,
    num_epochs=500,
    l2_reg=1e-6,
    l2_smooth=1e-6,
    init_method='orthogonal',
    optimizer='rprop',
)
eval_mtlr_1 = evaluate(n_mtlr, X_test, T_test, E_test, model_name="Neural MTLR 1-hidden layer")
eval_mtlr_1
from pysurvival.utils.display import display_loss_values

# Training-loss curve of the neural MTLR, then all metrics side by side.
display_loss_values(n_mtlr, figure_size=(7, 4))

eval_all = pd.concat([eval_coxph, eval_l_mtlr, eval_mtlr_1])
eval_all
# Keep the neural MTLR as the final model and score every test machine.
best_model = n_mtlr

risk_profile = X_test.copy()
# predict_risk runs while risk_profile still holds only the feature columns.
risk_profile['risk_score'] = best_model.predict_risk(risk_profile, use_log=True)
risk_profile.head()

# Cluster the one-dimensional risk scores into three groups, then derive the
# score interval of each group (ordered by minimum score: low < medium < high).
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=3, random_state=SEED).fit(risk_profile[['risk_score']])
risk_profile['risk_group'] = kmeans.labels_

risk_group_bound = (risk_profile.groupby('risk_group')['risk_score']
                    .min()
                    .sort_values()
                    .to_frame())
risk_group_bound.index = ['low', 'medium', 'high']
risk_group_bound.columns = ['lower_bound']
# Each group's upper bound is the next group's lower bound; the highest-risk
# group is capped at the overall maximum score.
risk_group_bound['upper_bound'] = (risk_group_bound['lower_bound']
                                   .shift(periods=-1)
                                   .fillna(risk_profile['risk_score'].max()))
risk_group_bound['color'] = ['red', 'green', 'blue']
risk_group_bound
from pysurvival.utils.display import create_risk_groups

# Assign each test machine to a risk group using the score intervals computed
# above. The **kwargs are {label: {'lower_bound', 'upper_bound', 'color'}};
# the result maps each label to (color, row indexes) -- per pysurvival docs.
risk_groups = create_risk_groups(
    model=best_model, X=X_test, num_bins=50,
    **risk_group_bound.T.to_dict())

# One subplot per risk group: the predicted survival curve of every machine
# in that group that actually failed (E == 1).
# NOTE(review): the loop body's indentation was lost in extraction (the file
# would not parse), and the enumerate() index was never used; both fixed.
fig, axes = plt.subplots(3, 1, figsize=(15, 10))
for ax, (label, (color, idxs)) in zip(axes.flat, risk_groups.items()):
    X = X_test.values[idxs, :]
    T = T_test[idxs]
    E = E_test[idxs]
    broken = np.argwhere(E == 1).flatten()
    for j in broken:
        survival = best_model.predict_survival(X[j, :]).flatten()
        ax.plot(best_model.times, survival, color=color)
    ax.set_title(f"{label.title()} Risk")
plt.show()
# Overlay one randomly chosen failed machine from each risk group: its
# predicted survival curve plus a dashed vertical line at the actual
# failure time.
# NOTE(review): the loop body's indentation was lost in extraction; restored
# with no logic change. np.random.choice is unseeded here, so the machines
# picked differ between runs -- seed if reproducible figures are needed.
plt.figure(figsize=(15, 5))
for i, (label, (color, idxs)) in enumerate(risk_groups.items()):
    record_idx = X_test.iloc[idxs, :].index
    X = X_test.values[idxs, :]
    T = T_test[idxs]
    E = E_test[idxs]
    # pick a random machine that has experienced a failure
    choices = np.argwhere((E == 1)).flatten()
    k = np.random.choice(choices, 1)[0]
    # predicted survival function
    survival = best_model.predict_survival(X[k, :]).flatten()
    plt.plot(best_model.times, survival,
             color=color, label=f'Record {record_idx[k]} ({label} risk)')
    # actual failure time
    actual_t = T[k]
    plt.axvline(x=actual_t, color=color, ls='--')
    # stagger the annotations vertically (offset grows with the group index i)
    plt.annotate(f'T={actual_t:.1f}',
                 xy=(actual_t, 0.5*(1+0.2*i)),
                 xytext=(actual_t, 0.5*(1+0.2*i)),
                 fontsize=12)
plt.title("Survival Functions Comparison between High, Medium, Low Risk Machine")
plt.legend()
plt.show()