# Dataframe
import pandas as pd
import numpy as np
import sys
# Visualization
import matplotlib.pylab as plt
import seaborn as sns
# Gradient boosting
import lightgbm as lgb
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
# Sklearn
from sklearn.model_selection import GroupKFold, train_test_split, StratifiedKFold
from sklearn.metrics import (
roc_auc_score,
classification_report,
confusion_matrix,
accuracy_score,
)
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.decomposition import PCA
## Oversampling
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
## Kaggler
import kaggler
from kaggler.model import AutoLGB
from kaggler.preprocessing import DAE, TargetEncoder, LabelEncoder
## CTGAN
from ctgan import CTGANSynthesizer
## TensorFlow
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.callbacks import EarlyStopping
## LightAutoML
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.dataset.roles import DatetimeRole
from lightautoml.tasks import Task
from lightautoml.utils.profiler import Profiler
## EVALML
from evalml.automl import AutoMLSearch
## FLAML
from flaml import AutoML
import torch
# Suppressing warnings
import warnings
warnings.filterwarnings("ignore")
# Read the three input tables from the local data folder.
INPUT_DIR = "Data"
bcell = pd.read_csv(f"{INPUT_DIR}/input_bcell.csv")
sars = pd.read_csv(f"{INPUT_DIR}/input_sars.csv")
covid = pd.read_csv(f"{INPUT_DIR}/input_covid.csv")

# Stack the B-cell and SARS rows on top of each other under a fresh
# 0..n-1 index; the COVID table is kept separate.
bcell_sars = pd.concat((bcell, sars), axis=0, ignore_index=True)

# Bare expressions kept from the notebook (they only display a value there).
bcell_sars.head()
bcell_sars["target"].value_counts()
# Boolean row mask: True where `target` is truthy, False otherwise.
idx_train = bcell_sars["target"].astype("bool").values

# One panel per column, overlaying the target-0 and target-1 distributions.
score_cols = ["chou_fasman", "emini", "kolaskar_tongaonkar", "parker"]
fig, axes = plt.subplots(2, 2, figsize=(16, 8))
axes = list(axes.flat)  # flatten the 2x2 grid row by row
for i, name in enumerate(score_cols):
    panel = axes[i]
    value = bcell_sars[name]
    sns.distplot(value[~idx_train], ax=panel)  # rows with target == 0
    sns.distplot(value[idx_train], ax=panel)  # rows with target == 1
    panel.set_xlabel(name, fontsize=12)
# NOTE(review): the source lost its indentation, so the loop extent is
# reconstructed; the figure-level legend is drawn once, after the loop.
fig.legend(labels=["target 0", "target 1"], loc="right", fontsize=12)

# Project the same four columns onto two principal components and scatter
# the two classes (target 1 first, to match the legend order).
clf = PCA(n_components=2)
z = clf.fit_transform(bcell_sars[score_cols])
plt.figure(figsize=(8, 6))
for class_mask in (idx_train, ~idx_train):
    plt.scatter(*z[class_mask].T, s=3)
plt.legend(labels=["target_1", "target_0"], fontsize=12)
plt.show()
# Boolean row mask: True where `target` is truthy, False otherwise.
idx_train = bcell_sars["target"].astype("bool").values

# Same two-part view as for the first four columns, now for four more.
protein_cols = ["isoelectric_point", "aromaticity", "hydrophobicity", "stability"]
fig, axes = plt.subplots(2, 2, figsize=(16, 8))
axes = list(axes.flat)  # flatten the 2x2 grid row by row
for i, name in enumerate(protein_cols):
    panel = axes[i]
    value = bcell_sars[name]
    sns.distplot(value[~idx_train], ax=panel)  # rows with target == 0
    sns.distplot(value[idx_train], ax=panel)  # rows with target == 1
    panel.set_xlabel(name, fontsize=12)
# NOTE(review): the source lost its indentation, so the loop extent is
# reconstructed; the figure-level legend is drawn once, after the loop.
fig.legend(labels=["target 0", "target 1"], loc="right", fontsize=12)

# Two-component PCA of the same columns, one scatter per class
# (target 1 first, to match the legend order).
clf = PCA(n_components=2)
z = clf.fit_transform(bcell_sars[protein_cols])
plt.figure(figsize=(8, 6))
for class_mask in (idx_train, ~idx_train):
    plt.scatter(*z[class_mask].T, s=3)
plt.legend(labels=["target_1", "target_0"], fontsize=12)
plt.show()
# Create the `length` column: inclusive span between start and end position.
for frame in (bcell, sars, covid, bcell_sars):
    frame["length"] = frame["end_position"] - frame["start_position"] + 1

# Count rows per length, split by target.
# The original drew two independent count plots on the same axes (all rows,
# then the target == 1 subset). Each call lays out its own categorical axis
# from the lengths present in the data it receives, so the bars of the second
# call are not guaranteed to sit over the same length as the first, and the
# "target 0" bars actually counted all rows. A single call with `hue` keeps
# both classes on one shared axis; seaborn adds the legend for the hue levels.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(
    data=bcell_sars,
    x="length",
    hue="target",
    hue_order=[0, 1],
    palette=["lightblue", "coral"],
    ax=ax,
)
plt.show()
# Correlation matrix
candidate_cols = [
    "parent_protein_id",
    "protein_seq",
    "start_position",
    "end_position",
    "peptide_seq",
    "chou_fasman",
    "emini",
    "kolaskar_tongaonkar",
    "parker",
    "isoelectric_point",
    "aromaticity",
    "hydrophobicity",
    "stability",
    "target",
]
# Correlation is only computed on numeric columns. Older pandas silently
# dropped non-numeric columns inside `.corr()`, while pandas 2.x raises on
# them, so the numeric subset is selected explicitly to work on both.
corr_matrix = bcell_sars[candidate_cols].select_dtypes(include="number").corr()

# Mask the upper triangle (diagonal included) so each pair is shown once.
# `np.bool` was removed in NumPy 1.24; the builtin `bool` is the replacement.
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))

# Correlation heatmap
sns.set_style("whitegrid")
f, ax = plt.subplots(figsize=(11, 15))
heatmap = sns.heatmap(
    corr_matrix,
    mask=mask,
    square=True,
    linewidths=0.5,
    cmap="coolwarm",
    cbar_kws={"shrink": 0.4, "ticks": [-1, -0.5, 0, 0.5, 1]},
    vmin=-1,
    vmax=1,
    annot=False,
    annot_kws={"size": 12},
    ax=ax,
)
# Add the column names as tick labels.
ax.set_yticklabels(corr_matrix.columns, rotation=0)
ax.set_xticklabels(corr_matrix.columns)
sns.set_style({"xtick.bottom": True}, {"ytick.left": True})
# Baseline classifiers on a fixed 80/20 split of the numeric features.
X = bcell_sars.drop(
    ["target", "parent_protein_id", "protein_seq", "peptide_seq"], axis=1
)
y = bcell_sars["target"]
X_train, X_valid, Y_train, Y_valid = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# The original called fit_transform/transform but threw the results away, so
# no scaling was ever applied. Keep the scaled values, wrapped back into
# DataFrames so the fitted models still see the column names.
d = MinMaxScaler()
X_train = pd.DataFrame(
    d.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
X_valid = pd.DataFrame(
    d.transform(X_valid), columns=X_valid.columns, index=X_valid.index
)


def _validation_auc(model):
    """Return the ROC AUC of a fitted *model* on the validation split.

    The score is computed from the predicted probability of class 1, and the
    true labels are passed first: ``roc_auc_score(y_true, y_score)``. The
    original passed hard predictions in the ``y_true`` position.
    """
    return roc_auc_score(Y_valid, model.predict_proba(X_valid)[:, 1])


l = LGBMClassifier(random_state=10)
l.fit(X_train, Y_train)
lg_pred = l.predict(X_valid)
print("AUC score :", _validation_auc(l))

cat = CatBoostClassifier(random_state=10, verbose=False)
cat.fit(X_train, Y_train)
cat_pred = cat.predict(X_valid)
print("AUC score :", _validation_auc(cat))

XG = XGBClassifier(random_state=10)
XG.fit(X_train, Y_train)
XG_pred = XG.predict(X_valid)
print("AUC score :", _validation_auc(XG))

RF = RandomForestClassifier(random_state=10)
RF.fit(X_train, Y_train)
RF_pred = RF.predict(X_valid)
print("AUC score :", _validation_auc(RF))

NB = GaussianNB()
NB.fit(X_train, Y_train)
NB_pred = NB.predict(X_valid)
print("AUC score :", _validation_auc(NB))
# Calculating feature importance with an extra-trees ensemble.
# `ExtraTreesClassifier` is not among the file's imports, so it is imported
# here to keep this cell runnable on its own.
from sklearn.ensemble import ExtraTreesClassifier

forest_clf = ExtraTreesClassifier(n_estimators=250, random_state=420)
forest_clf.fit(X, y)
imp_features = forest_clf.feature_importances_
# Spread of each feature's importance across the individual trees.
std = np.std([tree.feature_importances_ for tree in forest_clf.estimators_], axis=0)

# The original plotted `std` under the "Feature Importances" label; plot the
# importances themselves and show the per-tree spread as error bars.
plt.figure(figsize=(15, 8))
plt.bar(X.columns, imp_features, yerr=std, color="black", ecolor="tab:red", capsize=3)
plt.xlabel("Feature Labels")
plt.ylabel("Feature Importances")
plt.title("Comparison of different Feature Importances")
plt.show()
# Hard-coded importance value per feature, in the order they are plotted from.
# NOTE(review): presumably copied from an earlier LGBM run; confirm the source.
lgbm_importance = {
    "start_position": 268,
    "end_position": 154,
    "chou_fasman": 204,
    "emini": 221,
    "kolaskar_tongaonkar": 180,
    "parker": 244,
    "isoelectric_point": 335,
    "aromaticity": 355,
    "hydrophobicity": 354,
    "stability": 374,
    "length": 311,
}
feature_importance = pd.DataFrame(
    {
        "importance": list(lgbm_importance.values()),
        "feature": list(lgbm_importance.keys()),
    }
)

# Horizontal bars, largest importance at the top.
ranked = feature_importance.sort_values(by="importance", ascending=False)
sns.barplot(x="importance", y="feature", data=ranked)
plt.title("LGBM Features")
# Train/test split (80/20, unseeded) on the numeric feature columns.
X = bcell_sars.drop(
    columns=["target", "parent_protein_id", "protein_seq", "peptide_seq"]
)
y = bcell_sars["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

# Feature scaling: statistics are learned from the training rows only and
# then applied to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)
# Fully connected binary classifier: four ReLU layers of shrinking width,
# each followed by dropout, and a single sigmoid output unit.
model = Sequential(
    [
        Dense(units=128, activation="relu"),
        Dropout(0.3),
        Dense(units=64, activation="relu"),
        Dropout(0.2),
        Dense(units=32, activation="relu"),
        Dropout(0.2),
        Dense(units=16, activation="relu"),
        Dropout(0.2),
        Dense(units=1, activation="sigmoid"),
    ]
)
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=[tf.keras.metrics.BinaryAccuracy(), tf.keras.metrics.AUC()],
)

# Early stopping: halt once val_loss has not improved for 25 epochs.
early_stop = EarlyStopping(monitor="val_loss", mode="min", verbose=1, patience=25)

# NOTE(review): the test split doubles as the validation set, so early
# stopping is tuned on the same rows the model is later scored on.
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=150,
    validation_data=(X_test, y_test),
    verbose=1,
    callbacks=[early_stop],
)

# Per-epoch loss and metric curves.
model_loss = pd.DataFrame(history.history)
model_loss.plot()
# Predictions
# `Sequential.predict_classes` was removed in TensorFlow 2.6. For a single
# sigmoid output the equivalent is thresholding the predicted probability at
# 0.5. The probabilities are computed once and flattened to 1-D so they line
# up with `y_test`.
probabilities = model.predict(X_test).ravel()
predictions = (probabilities > 0.5).astype("int32")

print("AUC Score:", roc_auc_score(y_test, probabilities))
print("Accuracy Score :", accuracy_score(y_test, predictions))
print(
    classification_report(
        y_test, predictions, target_names=["Covid_Negative", "Covid_Positive"]
    )
)

# Confusion matrix (rows: actual class, columns: predicted class).
plt.figure(figsize=(10, 10))
cm = confusion_matrix(y_test, predictions)
sns.heatmap(
    cm,
    cmap="Blues",
    linecolor="black",
    linewidth=1,
    annot=True,
    fmt="",
    xticklabels=["Covid_Negative", "Covid_Positive"],
    yticklabels=["Covid_Negative", "Covid_Positive"],
)
plt.xlabel("Predicted")
plt.ylabel("Actual")
# Shared settings for the AutoML experiments.
N_THREADS = 4  # threads cnt for lgbm and linear models
JOB = 1
N_FOLDS = 5  # folds cnt for AutoML
RANDOM_STATE = 42  # fixed random state for various reasons
TEST_SIZE = 0.2  # Test size for metric check
TIMEOUT = 300  # Time in seconds for automl run
np.random.seed(RANDOM_STATE)
torch.set_num_threads(N_THREADS)

# Feature/target split used by the EvalML and FLAML cells.
# The original declared RANDOM_STATE and TEST_SIZE but never passed them to
# the splits, and this split ran before the global seed was set. Both splits
# below now use the constants explicitly, so they are reproducible and, being
# drawn from the same rows with the same seed, select the same test rows.
X = bcell_sars.drop(
    ["target", "parent_protein_id", "protein_seq", "peptide_seq"], axis=1
)
y = bcell_sars["target"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)


def AUC_metric(y_true, y_pred, **kwargs):
    """Return ROC AUC after thresholding *y_pred* at 0.5.

    Extra keyword arguments are forwarded to ``roc_auc_score``. This helper is
    defined but not used by the task below, which scores with
    ``roc_auc_score`` directly.
    """
    return roc_auc_score(y_true, (y_pred > 0.5).astype(int), **kwargs)


# LightAutoML task definition and column roles.
task = Task("binary", metric=roc_auc_score)
roles = {
    "target": "target",
}

# LightAutoML takes one table that still contains the target column.
df = bcell_sars.drop(["parent_protein_id", "protein_seq", "peptide_seq"], axis=1)
train_df, test_df = train_test_split(
    df, test_size=TEST_SIZE, random_state=RANDOM_STATE
)
%%time
# Fit LightAutoML on the training table and score it on the held-out table.
automl = TabularUtilizedAutoML(
    task=task,
    timeout=TIMEOUT,
    cpu_limit=N_THREADS,
    general_params={"use_algos": [["linear_l2", "lgb", "lgb_tuned"]]},
    reader_params={"n_jobs": JOB},
)
oof_pred = automl.fit_predict(train_df, roles=roles)

# The original predicted on `X_test`, which came from a different random
# split than `train_df`, so scored rows could also be training rows. Predict
# on `test_df`, the complement of `train_df`, and compare against its labels.
pred = automl.predict(test_df)
positive_proba = pred.data[:, 0]
prediction = (positive_proba > 0.5).astype(int)

# AUC is a ranking metric, so it is computed from the raw scores rather than
# from the thresholded labels.
print("AUC score :", roc_auc_score(test_df["target"], positive_proba))
%%time
# EvalML pipeline search for a binary problem, capped at 300 seconds.
search_settings = {
    "X_train": X_train,
    "y_train": y_train,
    "problem_type": "binary",
    # "random_seed": 2021,  (left disabled, as in the original)
    "max_time": 300,
}
automl = AutoMLSearch(**search_settings)
automl.search()

# Bare expression kept from the notebook (it only displays the rankings there).
automl.rankings
%%time
# Refit the best pipeline found by the search and score it on the test split.
pipeline = automl.best_pipeline
pipeline.fit(X_train, y_train)
preds = pipeline.predict(X_test)
# NOTE(review): `preds` comes from `predict`, not a probability method, so
# this AUC is presumably computed from class labels; confirm that is intended.
evalml_auc = roc_auc_score(y_test, preds)
print("AUC score:", evalml_auc)
# Initialize an AutoML instance
automl = AutoML()

# Specify automl goal and constraint
automl_settings = {
    "time_budget": 300,  # in seconds
    "metric": "roc_auc",
    "task": "classification",
}

# Train with labeled input data
automl.fit(X_train=X_train, y_train=y_train, **automl_settings)

# Bare expression kept from the notebook (it only displays the value there).
automl.best_estimator

# The search optimises ROC AUC, so report it from the predicted probability
# of class 1. The original scored the hard labels from `predict`, which
# discards the ranking information AUC is meant to measure.
flaml_proba = automl.predict_proba(X_test)[:, 1]
print("AUC Score:", roc_auc_score(y_test, flaml_proba))
# Voting ensemble trained on a randomly oversampled training set.
# `X` and `y` are the feature table and labels left in scope by an earlier cell.

# Split first, oversample second. The original oversampled the whole data set
# and split afterwards, so exact copies of minority rows landed in both the
# training and the test split and inflated the test scores.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

# `sampling_strategy` is passed by keyword (recent imbalanced-learn releases
# reject it positionally). The variable is not called `os`, to avoid
# shadowing the standard-library module of that name.
sampler = RandomOverSampler(sampling_strategy=0.95, random_state=10)
print(X_train.shape, y_train.shape)
X_train, y_train = sampler.fit_resample(X_train, y_train)
print(X_train.shape, y_train.shape)

# Scale with statistics learned from the training rows only.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

model1 = CatBoostClassifier(
    iterations=250,
    od_type="Iter",
    l2_leaf_reg=5,
    learning_rate=0.95,
    verbose=0,
    depth=10,
)
model2 = RandomForestClassifier(n_estimators=400, random_state=1)
model3 = GaussianNB()
model4 = LGBMClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    num_leaves=120,
    n_jobs=4,
    min_child_samples=14,
    min_child_weight=10,
)
# Hard voting: the ensemble predicts the majority class label of its members.
model = VotingClassifier(
    estimators=[("cat", model1), ("RF", model2), ("NB", model3), ("LGBM", model4)],
    voting="hard",
)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Check the ROC score of the model. With hard voting only class labels are
# available, so this AUC is computed from labels, not probabilities.
print("AUC score:", roc_auc_score(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))

# Confusion matrix (rows: actual class, columns: predicted class).
plt.figure(figsize=(10, 10))
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(
    cm,
    cmap="Blues",
    linecolor="black",
    linewidth=1,
    annot=True,
    fmt="",
    xticklabels=["Covid_Negative", "Covid_Positive"],
    yticklabels=["Covid_Negative", "Covid_Positive"],
)
plt.xlabel("Predicted")
plt.ylabel("Actual")
# Numeric table (features + target) with incomplete rows removed.
data = bcell_sars.drop(
    columns=["parent_protein_id", "protein_seq", "peptide_seq"]
).dropna()

# Load a previously saved synthesizer instead of refitting it.
# The disabled calls below were left in by the author.
ctgan = CTGANSynthesizer()
# ctgan.fit(data)
# ctgan.save("Covid_model_CTGAN.pkl")
# NOTE(review): the model file is presumably a pickle; only load files from
# a trusted source.
loaded = ctgan.load("Covid_model_CTGAN_synth.pkl")

# Draw 20000 synthetic rows and separate the target column from the rest.
samples = loaded.sample(20000)
y = samples["target"]
X = samples.drop(columns=["target"])

# Bare expression kept from the notebook (it only displays the counts there).
y.value_counts()
# Voting ensemble trained on an oversampled split of the rows in `X` / `y`
# (left in scope by the preceding cell).

# Split first, oversample second. The original oversampled everything and
# split afterwards, so duplicated minority rows appeared in both the training
# and the test split and inflated the test scores.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# `sampling_strategy` is passed by keyword (recent imbalanced-learn releases
# reject it positionally). The variable is not called `os`, to avoid
# shadowing the standard-library module of that name.
sampler = RandomOverSampler(sampling_strategy=0.95, random_state=10)
print(X_train.shape, y_train.shape)
X_train, y_train = sampler.fit_resample(X_train, y_train)
print(X_train.shape, y_train.shape)

# Scale with statistics learned from the training rows only.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Bare expression kept from the notebook (it only displays the counts there).
y_test.value_counts()

model1 = CatBoostClassifier(
    iterations=250,
    od_type="Iter",
    l2_leaf_reg=5,
    learning_rate=0.95,
    verbose=0,
    depth=10,
)
model2 = RandomForestClassifier(n_estimators=400, random_state=1)
model3 = GaussianNB()
model4 = LGBMClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    num_leaves=120,
    n_jobs=4,
    min_child_samples=14,
    min_child_weight=10,
)
# Hard voting: the ensemble predicts the majority class label of its members.
model = VotingClassifier(
    estimators=[("cat", model1), ("RF", model2), ("NB", model3), ("LGBM", model4)],
    voting="hard",
)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Check the ROC score of the model. With hard voting only class labels are
# available, so this AUC is computed from labels, not probabilities.
print("AUC Score:", roc_auc_score(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))

# Confusion matrix (rows: actual class, columns: predicted class).
plt.figure(figsize=(10, 10))
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(
    cm,
    cmap="Blues",
    linecolor="black",
    linewidth=1,
    annot=True,
    fmt="",
    xticklabels=["Covid_Negative", "Covid_Positive"],
    yticklabels=["Covid_Negative", "Covid_Positive"],
)
plt.xlabel("Predicted")
plt.ylabel("Actual")
# Denoising-autoencoder (DAE) features appended to the raw numeric columns.
feature_name = "dae_te"
algo_name = "lgb"
model_name = f"{algo_name}_{feature_name}"

# NOTE(review): `trn`, `tst` and `n_trn` are not used by the visible code below.
trn, tst = train_test_split(data, test_size=0.2)
n_trn = trn.shape[0]

target_col = "target"
num_cols = [
    "start_position",
    "end_position",
    "chou_fasman",
    "emini",
    "kolaskar_tongaonkar",
    "parker",
    "isoelectric_point",
    "aromaticity",
    "hydrophobicity",
    "stability",
]
n_fold = 10
seed = 42
encoding_dim = 256

# NOTE(review): the autoencoder is fitted on every row of `data`, including
# rows that end up in the test split below.
dae = DAE(num_cols=num_cols, encoding_dim=encoding_dim)
X = dae.fit_transform(data[num_cols])

# Give the encoded features the same index as `data`. `data` keeps its
# original row labels after `dropna`, so a default 0..n-1 index here would
# make the column-wise concat below align on mismatched labels and insert
# NaN rows whenever any row had been dropped.
df_dae = pd.DataFrame(
    X, columns=[f"dae_{i}" for i in range(encoding_dim)], index=data.index
)
print(df_dae.shape)
X = pd.concat([data[num_cols], df_dae], axis=1)
y = data[target_col]

# Split first, oversample second. The original oversampled everything and
# split afterwards, so duplicated minority rows appeared in both the training
# and the test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

# `sampling_strategy` is passed by keyword (recent imbalanced-learn releases
# reject it positionally). The variable is not called `os`, to avoid
# shadowing the standard-library module of that name.
sampler = RandomOverSampler(sampling_strategy=0.95, random_state=10)
print(X_train.shape, y_train.shape)
X_train, y_train = sampler.fit_resample(X_train, y_train)
print(X_train.shape, y_train.shape)

# Oversampled training table (features plus target) kept under the original name.
df = pd.concat([X_train, y_train], axis=1)
%%time
# Four base learners combined by majority (hard) vote.
model1 = CatBoostClassifier(
    iterations=250,
    od_type="Iter",
    l2_leaf_reg=5,
    learning_rate=0.95,
    verbose=0,
    depth=10,
)
model2 = RandomForestClassifier(n_estimators=400, random_state=1)
model3 = GaussianNB()
model4 = LGBMClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    num_leaves=120,
    n_jobs=4,
    min_child_samples=14,
    min_child_weight=10,
)
base_learners = [("cat", model1), ("RF", model2), ("NB", model3), ("LGBM", model4)]
model = VotingClassifier(estimators=base_learners, voting="hard")
model.fit(X_train, y_train)

# Predict class labels for the held-out rows.
y_pred = model.predict(X_test)

# Report ROC AUC (computed from the predicted labels) and accuracy.
ensemble_auc = roc_auc_score(y_test, y_pred)
ensemble_acc = accuracy_score(y_test, y_pred)
print("AUC Score:", ensemble_auc)
print("Accuracy Score:", ensemble_acc)

# Confusion matrix heatmap (rows: actual class, columns: predicted class).
class_names = ["Covid_Negative", "Covid_Positive"]
plt.figure(figsize=(10, 10))
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(
    cm,
    cmap="Blues",
    linecolor="black",
    linewidth=1,
    annot=True,
    fmt="",
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel("Predicted")
plt.ylabel("Actual")
# !pip install black
!black Vaccine_Development_ML.ipynb