import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
pd.options.mode.chained_assignment = None # suppress warning; default='warn'
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
patients = pd.read_csv('PatientInfo.csv')
patients.head()
data = patients.drop(columns=['patient_id', 'global_num', 'birth_year', 'age', 'disease', 'infected_by'])
print("Entries with no age:", len(patients[patients['age'].isnull()]))
print("Entries with no birth year:", len(patients[patients['birth_year'].isnull()]))
Entries with no age: 105
Entries with no birth year: 464
date_by_age = patients['age'].fillna('0').str.extract(r'(\d+)').astype(int)
date_by_birth_year = 2020-patients['birth_year'].fillna(2020)
data['age'] = pd.concat([date_by_age, date_by_birth_year], axis=1).max(axis=1)
print("Entries with no age or birth year:", len(data[data['age'].isnull()]))
data.head(5)
Entries with no age or birth year: 0
plt.title("Age Distribution of South Korean COVID-19 Cases")
ax1 = sns.distplot(data['age'], rug=True, bins=10)
print("Entries with no infection order:", len(data[data['infection_order'].isnull()]))
print("Entries with no symptom onset date:", len(data[data['symptom_onset_date'].isnull()]))
print("Entries with no confirmed date:", len(data[data['confirmed_date'].isnull()])) #should be 0
print("Entries with a contact value:", len(data[data['contact_number'].notnull()]))
print("Entries with an infection order:", len(data[data['infection_order'].notnull()]))
Entries with no infection order: 3097
Entries with no symptom onset date: 2682
Entries with no confirmed date: 0
Entries with a contact value: 589
Entries with an infection order: 31
data['symptom_onset_date'] = pd.to_datetime(data['symptom_onset_date'].replace(" ", ""),
format='%Y-%m-%d')
data['confirmed_date'] = pd.to_datetime(data['confirmed_date'].replace(" ", ""),
format='%Y-%m-%d')
data['released_date'] = pd.to_datetime(data['released_date'].replace(" ", ""),
format='%Y-%m-%d')
data['time_before_hospitalization'] = data['confirmed_date'] - data['symptom_onset_date']
data['time_in_hospital'] = data['released_date'] - data['confirmed_date'] #death or otherwise
data.head(5)
cases_by_day = data.groupby('confirmed_date').count()[['sex']].reset_index().rename(columns={'sex':'count'})
cases_by_day['cumulative_sum'] = cases_by_day['count'].cumsum()
cases_by_day
ax7 = sns.lineplot(x='confirmed_date', y='cumulative_sum', data=cases_by_day)
suppressor = ax7.set_yscale('log')
print("Average time to confirm after symptom onset:", np.mean(data['time_before_hospitalization']))
print("Average time to release after confirmed:", np.mean(data['time_in_hospital']))
Average time to confirm after symptom onset: 4 days 07:03:54.606741
Average time to release after confirmed: 20 days 05:47:53.394495
#time in hospital by infection type
print("Entries with no infection case and no hospitalization time:",
sum(data.dropna(subset=['time_in_hospital'])['infection_case'].isnull()))
Entries with no infection case and no hospitalization time: 454
temp_data = data.dropna(subset=['infection_case', 'time_in_hospital'])
temp_data['time_in_hospital'] = temp_data['time_in_hospital'].apply(pd.Timedelta.total_seconds) / (24*60*60)
plt.figure(figsize=(15,12))
plt.title("Hospitalization Time Distribution by Infection Case")
ax2 = sns.violinplot(x='infection_case', y='time_in_hospital',
data=temp_data)
ax2.axhline(np.median(temp_data['time_in_hospital']), ls='--')
suppressor = ax2.set_xticklabels(ax2.get_xticklabels(), rotation=90)
from statsmodels.stats.weightstats import ztest
t, p = ztest(x1=temp_data[temp_data['infection_case']=='Gyeongsan Jeil Silver Town']['time_in_hospital'],
value=np.mean(temp_data['time_in_hospital']))
print("P-value:", p)
print("T-statistic:", t)
P-value: 2.2730538888858298e-08
T-statistic: 5.589819789370226
data['dead'] = (data['state'].factorize()[0] == 2).astype(int) #0: alive; 1: dead
data.head(5)
plt.figure(figsize=(10, 8))
plt.title("Mortality by Age")
ax3 = sns.regplot(x='age', y='dead', data=data, y_jitter=0.01, logistic=True, truncate=True,
scatter_kws={'s':5, 'alpha':1}, line_kws={'color':'red'})
model = data.copy()
model['bias'] = 1.0 #intercept term
model['time_in_hospital'] = model['time_in_hospital'].\
fillna(np.mean(data['time_in_hospital'])).\
apply(pd.Timedelta.total_seconds) / (24*60*60)
model['sex'] = (model['sex'].factorize()[0]).astype(int) #one-hot encode sex
model.head()
def vectorize(t, features, y):
return t[features].values.T, t[y].values
features = ['bias', 'age', 'sex', 'time_in_hospital']
X_train, y_train = vectorize(t=model, features=features, y='dead')
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(fit_intercept=False, C=1e9, solver='lbfgs')
lr.fit(X_train.T, y_train)
for i in range(len(features)):
print(features[i] + " coefficient: " + str(lr.coef_[0][i]))
bias coefficient: -8.392651804457564
age coefficient: 0.09238093162215619
sex coefficient: -1.4829568336690335
time_in_hospital coefficient: -0.02228844707944757
def sigma(t):
return 1 / (1 + np.e**(-t))
age_bins = np.linspace(np.min(model['age']), np.max(model['age']), 21)
averages = [np.average(model[np.abs(model['age']-a)<3]['dead']) for a in age_bins]
plt.figure(figsize=(10, 8))
plt.title("Predicted Mortality by Age")
ax4 = sns.scatterplot(x='age', y='dead', data=model)
plt.scatter(age_bins, averages, label = 'binned means')
plt.plot(age_bins, sigma(lr.coef_[0][0] + age_bins * lr.coef_[0][1]), color='r', label = 'logistic model')
suppressor = plt.legend()
import sklearn.model_selection
X_train, X_val, y_train, y_val = sklearn.model_selection.train_test_split(X_train.T, y_train.T, test_size=0.2)
X_val, X_test, y_val, y_test = sklearn.model_selection.train_test_split(X_val, y_val, test_size=0.5)
X_train.shape, X_val.shape, X_test.shape, y_train.shape, y_val.shape, y_test.shape
from keras.models import Sequential
from keras.layers import Dense
model = Sequential([Dense(4, activation='relu', input_shape=(4,)),
Dense(4, activation='relu'),
Dense(1, activation='sigmoid'),])
model.compile(optimizer='sgd',
loss='binary_crossentropy',
metrics=['accuracy'])
history = model.fit(X_train, y_train,
batch_size=1, epochs=5,
validation_data=(X_val, y_val))
nn_predictions = model.predict_classes(X_test, verbose=1).flatten()
test_accuracy = model.evaluate(X_test, y_test)[1]
print(f"Test accuracy: {test_accuracy:.4f}")
Using TensorFlow backend.
Train on 2502 samples, validate on 313 samples
Epoch 1/5
2502/2502 [==============================] - 3s 1ms/step - loss: 0.1417 - accuracy: 0.9784 - val_loss: 0.0768 - val_accuracy: 0.9840
Epoch 2/5
2502/2502 [==============================] - 6s 2ms/step - loss: 0.1076 - accuracy: 0.9796 - val_loss: 0.0707 - val_accuracy: 0.9840
Epoch 3/5
2502/2502 [==============================] - 4s 2ms/step - loss: 0.1020 - accuracy: 0.9796 - val_loss: 0.0840 - val_accuracy: 0.9840
Epoch 4/5
2502/2502 [==============================] - 2s 991us/step - loss: 0.0968 - accuracy: 0.9796 - val_loss: 0.0647 - val_accuracy: 0.9840
Epoch 5/5
2502/2502 [==============================] - 3s 1ms/step - loss: 0.0924 - accuracy: 0.9796 - val_loss: 0.0662 - val_accuracy: 0.9840
313/313 [==============================] - 0s 54us/step
313/313 [==============================] - 0s 45us/step
Test accuracy: 0.9840
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper right')
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.hlines(test_accuracy, 0, 4, colors='r', linestyles='dashed')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation', 'Test'], loc='left')
def predict(X, betas):
return sigma(X.T @ betas)
def classify(preds, threshold = 0.5):
return np.int64(preds >= threshold)
def accuracy(actual, pred):
return np.mean(actual == pred)
def precision(actual, pred):
tp = sum((actual == pred) & (actual == 1))
tn = sum((actual == pred) & (actual == 0))
fp = sum((actual != pred) & (actual == 0))
fn = sum((actual != pred) & (actual == 1))
return tp / (tp + fp)
def recall(actual, pred):
tp = sum((actual == pred) & (actual == 1))
tn = sum((actual == pred) & (actual == 0))
fp = sum((actual != pred) & (actual == 0))
fn = sum((actual != pred) & (actual == 1))
return tp / (tp + fn)
from itertools import product
prob = []
for a, s, t in product(np.linspace(1, 100, 100),
(0, 1),
np.linspace(1, 40, 40)):
prob.append([a, s+101, t] + [predict(np.array([1, a, s, t]), lr.coef_[0])])
predictions = pd.DataFrame(np.array(prob)).\
rename(columns={0: 'age', 1: 'sex', 2: 'time in hospital', 3: 'probability'}).\
replace({101: 'male', 102:'female'})
predictions.head()
male = predictions[predictions['sex']=='male'].groupby('age').mean()
female = predictions[predictions['sex']=='female'].groupby('age').mean()
plt.title("Age vs Mortality by Sex")
ax5 = plt.plot(male.index, male['probability'], label='male')
ax5 = plt.plot(female.index, female['probability'], label='female')
suppressor = plt.legend()
hospital = predictions.groupby('time in hospital').mean()
plt.title("Time in Hospital vs Mortality")
ax6 = plt.plot(hospital.index, hospital['probability'])