import math
import sys
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
# Use seaborn's 'whitegrid' theme for the figures drawn below.
sns.set_style('whitegrid')
# IPython magics (not plain Python): draw figures inline, in 'retina' format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import warnings
# Suppress every warning from here on.
warnings.filterwarnings('ignore')
# Print the interpreter and library versions used for this run.
print(f'Python version: {sys.version}')
print(f'pandas version: {pd.__version__}')
print(f'numpy version: {np.__version__}')
print(f'seaborn version: {sns.__version__}')
# Current timestamp; the value is not stored (presumably shown as notebook cell output).
pd.Timestamp.now()
# Date-only alternative: pd.Timestamp.now().strftime('%Y-%m-%d')
from datetime import datetime
import socket  # part of the standard library; no pip install is needed

# Log when and where this run happened.
print(f'last run: {datetime.now()}')
try:
    print(f'host info: {socket.gethostbyname_ex(socket.gethostname())}')
except (OSError, UnicodeError) as err:
    # Best effort only: name resolution can fail (no DNS entry, odd host name).
    # Catch just those failures instead of a bare `except`, and say so rather
    # than silently hiding the problem.
    print(f'host info: unavailable ({err})')
# WHO COVID-19 global table snapshot, read straight from GitHub into a DataFrame.
data_url = 'https://raw.githubusercontent.com/GuySuphakit/WHO-COVID-19-Analysis/main/WHO%20COVID-19%20global%20table%20data%20June%2027th%202021%20at%205.02.05%20AM.csv'
df = pd.read_csv(data_url)
# Peek at the first five rows.
df.head(5)
def df_unique_value(df):
    """
    print the unique values of every text-like or categorical column of `df`

    A column qualifies when its dtype is `object`, a pandas string dtype or
    `category`. For each such column the name (left-aligned, padded to 10
    characters) is printed, then its unique values on the next line, then a
    line of 65 dashes. All other columns are skipped. Returns None.
    """
    for c in df.columns:
        col_type = df[c].dtype
        # pandas string dtypes do not compare equal to 'object', so they are
        # checked explicitly; otherwise text columns would be skipped silently.
        is_text = col_type == 'object' or isinstance(col_type, pd.StringDtype)
        if is_text or col_type.name == 'category':
            print(f'{c:10}\n{df[c].unique()}')
            print('-' * 65)
def convert_cols_to_category(df, cols:list):
    """
    convert `cols` to `category`

    `cols` is a list of column names; a single column name given as a plain
    string is accepted as well. The columns are converted in place and the
    same `df` object is returned. A KeyError is raised (before anything is
    modified) when a requested column does not exist.
    """
    # A bare string would otherwise select a Series and iterate its values.
    cols = [cols] if isinstance(cols, str) else list(cols)
    # Selecting df[cols] first validates every name before any column changes.
    for c in df[cols]:
        df[c] = df[c].astype('category')
    return df
def convert_obj_columns_to_category(df):
    """
    convert every text column of `df` to `category`

    Columns whose dtype is `object` or a pandas string dtype are converted in
    place; columns that are already `category` and all other columns are left
    as they are. The same `df` object is returned.
    """
    for c in df.columns:
        col_type = df[c].dtype
        if col_type.name == 'category':
            continue  # already categorical; re-casting would be a no-op
        # pandas string dtypes do not compare equal to 'object', so they are
        # checked explicitly; otherwise text columns would stay unconverted.
        if col_type == 'object' or isinstance(col_type, pd.StringDtype):
            df[c] = df[c].astype('category')
    return df
def print_category_columns(df):
    """
    print the numbered categories of every `category` column of `df`

    Each categorical column is shown as its name (padded to 15 characters)
    followed by a list of (position, category) pairs, then a line of 60 dashes.
    """
    separator = '-' * 60
    for name in df.columns:
        series = df[name]
        if series.dtype.name != 'category':
            continue
        numbered = list(enumerate(series.cat.categories))
        print(f'{name:15}: {numbered}')
        print(separator)
def plot_mn(df, cols, n_rows:int=1, kind:str='boxplot', color='salmon'):
    """
    plot boxplot, boxen, violin, hist in m (rows) by n (columns)
    >>> plot_mn(df, ['Calories', 'Fat'], 2, 'hist')

    One subplot is drawn per entry of `cols`, laid out on `n_rows` rows.
    Columns with `category` dtype always get a count plot; every other column
    gets the plot selected by `kind` ('boxplot', 'boxen', 'violin' or 'hist',
    case-insensitive) in the given `color`.
    Raises ValueError for an unknown `kind`. Nothing is drawn when `cols` is
    empty. Returns None.
    """
    kind = kind.lower()
    if kind not in ('boxplot', 'boxen', 'violin', 'hist'):
        # An unknown kind used to leave every numeric subplot silently empty.
        raise ValueError(
            f"unknown kind {kind!r}; expected 'boxplot', 'boxen', 'violin' or 'hist'")
    cols = list(cols)
    n = len(cols)
    if n == 0:
        return
    n_cols = math.ceil(n / n_rows)
    # squeeze=False always yields a 2-D array of axes, so ravel() also works
    # for a single subplot (where plt.subplots would return a bare Axes).
    fig, ax = plt.subplots(n_rows, n_cols, figsize=(n_cols*3, n_rows*3.5), squeeze=False)
    ax = ax.ravel()
    for i, c in enumerate(cols):
        if df[c].dtype.name == 'category':
            sns.countplot(data=df, x=c, ax=ax[i])
        elif kind == 'boxplot':
            sns.boxplot(data=df[[c]], ax=ax[i], color=color)
        elif kind == 'boxen':
            sns.boxenplot(data=df[[c]], ax=ax[i], color=color)
        elif kind == 'violin':
            sns.violinplot(data=df[[c]], ax=ax[i], color=color)
        else:
            # histplot replaces the deprecated distplot(hist=True, kde=False).
            sns.histplot(data=df, x=c, kde=False, ax=ax[i], color=color)
    # Fit the layout once the subplots have their content and labels.
    fig.tight_layout()
# Remove duplicate rows in place, then inspect the structure, the first rows
# and the number of missing values per column.
df.drop_duplicates(inplace=True)
df.info()
df.head()
df.isna().sum()
# Look at the unique values of each text/categorical column
df_unique_value(df)
# Rows with a missing per-100000 case rate or a missing WHO region.
df[df['Cases - cumulative total per 100000 population'].isna()|df['WHO Region'].isna()]
df[df['Transmission Classification'] == 'No cases'].head()
df[df['Transmission Classification'] == 'Not applicable'].head() # 'Not applicable': cannot be put to use
# Convert the object-typed columns to category so that count plots can be drawn for them.
df=convert_obj_columns_to_category(df)
# Drop the "Global" and "others" rows.
# NOTE(review): this relies on those rows carrying index labels 0 and 201 - confirm against the data.
df.drop(index=[0,201], axis=0,inplace=True)
# Drop the rows whose Transmission Classification is 'No cases'.
# The drop must be applied in place (or re-assigned); a bare df.drop(...) only
# returns a new frame and leaves `df` unchanged.
df.drop(index=df[df['Transmission Classification'] == 'No cases'].index, inplace=True)
df.columns
# Columns used in the analysis below (identifier, region, case/death measures, transmission class).
cols=['Name', 'WHO Region', 'Cases - cumulative total',
      'Cases - cumulative total per 100000 population',
      'Cases - newly reported in last 7 days',
      'Cases - newly reported in last 7 days per 100000 population',
      'Cases - newly reported in last 24 hours', 'Deaths - cumulative total',
      'Deaths - cumulative total per 100000 population',
      'Deaths - newly reported in last 7 days',
      'Deaths - newly reported in last 7 days per 100000 population',
      'Deaths - newly reported in last 24 hours',
      'Transmission Classification']
# Rough overview plot of every feature (all columns of df, laid out on 3 rows).
plot_mn(df, df.columns, 3, 'hist')
# Row count per WHO region, largest first, as an interactive bar chart
# (plotly counterpart of a seaborn countplot ordered by value_counts).
region_counts = df.groupby(by="WHO Region", as_index=False).agg(
    counts=pd.NamedAgg(column="WHO Region", aggfunc="count"))
temp_df = region_counts.sort_values(by="counts", ascending=False)
px.bar(temp_df,
       x='WHO Region',
       y='counts',
       color='WHO Region',
       color_discrete_sequence=px.colors.qualitative.Prism,
       title='WHO Region Countplot')
# Same chart for the transmission classification.
transmission_counts = df.groupby(by="Transmission Classification", as_index=False).agg(
    counts=pd.NamedAgg(column="Transmission Classification", aggfunc="count"))
temp_df = transmission_counts.sort_values(by="counts", ascending=False)
px.bar(temp_df,
       x='Transmission Classification',
       y='counts',
       color='Transmission Classification',
       color_discrete_sequence=px.colors.qualitative.Vivid,
       title='Transmission Classification Countplot')
# Histograms (with a box plot on the margin) of the cumulative case measures.
px.histogram(df,
             x="Cases - cumulative total",
             marginal="box",
             color_discrete_sequence=px.colors.qualitative.D3,
             title='Cases - cumulative total Countplot')
px.histogram(df,
             x="Cases - cumulative total per 100000 population",
             marginal="box",
             color_discrete_sequence=px.colors.qualitative.D3,
             title='Cases - cumulative total per 100000 population Countplot')
# The same for the cumulative death measures.
px.histogram(df,
             x="Deaths - cumulative total",
             marginal="box",
             color_discrete_sequence=px.colors.qualitative.Set1,
             title='Deaths - cumulative total')
px.histogram(df,
             x="Deaths - cumulative total per 100000 population",
             marginal="box",
             color_discrete_sequence=px.colors.qualitative.Set1,
             title='Deaths - cumulative total per 100000 population Countplot')
# Joint density of the per-100000 case and death rates.
px.density_heatmap(df,
                   x="Cases - cumulative total per 100000 population",
                   y="Deaths - cumulative total per 100000 population",
                   color_continuous_scale="BuGn",
                   title='Cases and Deaths total per 100000 population Countplot')
# Top 10 countries by cumulative cases. The ten rows are selected once and
# passed as the chart's data frame, instead of calling nlargest() three times
# and mixing length-10 Series with the full `df`.
top_cases = df.nlargest(10, 'Cases - cumulative total')
px.bar(data_frame = top_cases,
       x = 'Name',
       y = 'Cases - cumulative total',
       color = 'Cases - cumulative total',
       color_continuous_scale = "PuBu",
       title = 'Top 10 ประเทศที่มีจำนวนเคสผู้ติดเชื้อรวมมากที่สุด')
# Top 10 countries by cumulative deaths.
top_deaths = df.nlargest(10, 'Deaths - cumulative total')
px.bar(data_frame = top_deaths,
       x = 'Name',
       y = 'Deaths - cumulative total',
       color = 'Deaths - cumulative total',
       color_continuous_scale = "Reds",
       title = 'Top 10 ประเทศที่มีจำนวนผู้เสียชีวิตรวมมากที่สุด')
# Pairwise correlation of the numeric columns in `cols`. The non-numeric
# columns (Name, WHO Region, Transmission Classification) are excluded
# explicitly: recent pandas versions raise on them instead of dropping them.
dcorr = df[cols].select_dtypes(include='number').corr()
# Boolean mask hiding the upper triangle (diagonal included), which only
# repeats the lower triangle.
mask = np.zeros(dcorr.shape, dtype=bool)
mask[np.triu_indices_from(mask)] = True
fig, ax = plt.subplots(figsize=(7,5))
sns.heatmap(dcorr, cmap=sns.diverging_palette(10, 145, n=100),
            vmin=-1, vmax=1, center=0, linewidths=1, annot=True, mask=mask, ax=ax).set_title("Correlation\nHeatmap", fontsize=22,fontweight="bold");
# The two per-100000 measures selected as the feature matrix X for clustering.
cols2=['Cases - cumulative total per 100000 population',
       'Deaths - cumulative total per 100000 population']
df[cols2].head()
X=df[cols2]
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
!pip install yellowbrick
from yellowbrick.cluster import SilhouetteVisualizer
def sil_score(X, from_k=2, to_k=6, random_state=None):
    '''
    calculate silhouette score for k clusters

    Fits one KMeans model for every k in from_k..to_k (inclusive) and returns
    a list of [score, k] pairs, the score being the mean silhouette rounded to
    4 decimals. `random_state` is passed to KMeans; give an int to make the
    scores reproducible (None keeps KMeans' random initialisation).
    '''
    sils=[]
    for k in range(from_k, to_k + 1):
        m = KMeans(n_clusters=k, random_state=random_state)
        m.fit(X)
        # Plain float so the printed list stays readable.
        silhouette_avg = round(float(silhouette_score(X, m.labels_)), 4)
        sils.append([silhouette_avg, k])
    return sils

# Fixed seed so the reported scores and the suggested k are repeatable.
ss=sil_score(X, 2, 6, random_state=1)
print(f'scores = {ss}')
# max() compares the [score, k] pairs by score first.
print(f'optimal number of clusters = {max(ss)[1]}')
def silhouette_plot(X, from_k, to_k, random_state=None):
    '''
    draw a silhouette plot for every k in from_k..to_k (inclusive)

    Returns a list of [score, k] pairs, the score being the visualizer's
    `silhouette_score_`. `random_state` is passed to KMeans; give an int to
    make the plots reproducible (None keeps KMeans' random initialisation).
    '''
    sil_scores=[]
    for k in range(from_k, to_k + 1):
        # Instantiate the clustering model and visualizer
        m = KMeans(n_clusters=k, random_state=random_state)
        visualizer = SilhouetteVisualizer(m)
        visualizer.fit(X)   # Fit the data to the visualizer
        visualizer.show()   # show() replaces the deprecated poof()
        sil_scores.append([visualizer.silhouette_score_, k])
    return sil_scores

# Fixed seed so the plots are repeatable between runs.
scores=silhouette_plot(X, 2, 6, random_state=1)
# Final model with 3 clusters. The seed keeps the cluster numbering stable
# between runs; without it the ids 0/1/2 can swap meaning on every execution.
model = KMeans(n_clusters=3, random_state=1)
model.fit(X)
model.cluster_centers_.round(3)
model.transform(X) # the distance between each point and centroid
# assign() builds a new frame, so the label column is not written into a
# frame that was sliced out of `df`.
X = X.assign(cluster=model.labels_)
X
# Table of descriptive statistics per cluster
X.groupby('cluster').describe().T
# Boxen plot of each feature, split by cluster
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 9))
for panel, col in zip(ax.ravel(), cols2):
    sns.boxenplot(x='cluster', y=col, data=X, ax=panel)
# Heatmaps of the per-cluster medians: column 0 (cases per 100000) in blue,
# column 1 (deaths per 100000) in red; column 2 is the cluster label.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 9))
for position, (panel, palette) in enumerate(zip(ax, ("Blues", "Reds"))):
    medians = X.iloc[:, [position, 2]].groupby('cluster').median()
    sns.heatmap(medians.sort_values(by='cluster'), cmap=palette, linewidths=1, ax=panel)
fig.show()
# df2 is another name for the clustered frame X (no copy is made).
df2=X
df2
!pip install lightgbm
import lightgbm as lgb
print(f'lightgbm version {lgb.__version__}')
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, ShuffleSplit, cross_val_score, cross_val_predict, cross_validate
# NOTE(review): plot_confusion_matrix is, to my knowledge, no longer available in
# scikit-learn >= 1.2 - confirm the scikit-learn version this is meant to run on.
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, precision_recall_fscore_support, f1_score
df2.columns
# Features: the two per-100000 measures; target: the cluster label.
feature_cols=['Cases - cumulative total per 100000 population','Deaths - cumulative total per 100000 population']
target_col='cluster'
X=df2[feature_cols]
y=df2[target_col]
# 80/20 split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.20,
                                                    random_state=1,
                                                    stratify=y)
# Default classifier, only used to look at the available parameters.
clf=lgb.LGBMClassifier()
clf.get_params()
# tuning parameters: https://lightgbm.readthedocs.io/en/latest/Parameters-Tuning.html
# https://sites.google.com/view/lauraepp/parameters
params={'boosting_type': 'gbdt',
        'class_weight': None,
        'colsample_bytree': 1.0,
        'importance_type': 'split',
        'learning_rate': 0.1,
        'max_depth': -1,
        'min_child_samples': 20,
        'min_child_weight': 0.001,
        'min_split_gain': 0.0,
        'n_estimators': 100,
        'n_jobs': -1,
        'num_leaves': 31,
        'objective': None,
        'random_state': None,
        'reg_alpha': 0.0,
        'reg_lambda': 0.0,
        'subsample': 1.0,
        'subsample_for_bin': 200000,
        'subsample_freq': 0,
        # Replaces the former 'silent': True, which LightGBM 4 no longer accepts.
        'verbose': -1}
clf = lgb.LGBMClassifier(**params)
# eval_metric: https://lightgbm.readthedocs.io/en/latest/Parameters.html?highlight=metric#metric-parameters
# The target has three classes, so the multi-class log loss is named explicitly.
# Early stopping (20 rounds) and logging (every 10 rounds) are passed as
# callbacks (requires lightgbm >= 3.3); the `early_stopping_rounds` and
# `verbose` arguments of fit() were removed in LightGBM 4.
# NOTE(review): the test split doubles as the early-stopping set, so the test
# scores below are optimistic; a separate validation split would avoid that.
fit_params={'eval_metric': 'multi_logloss',
            'eval_set': [(X_test, y_test)],
            'callbacks': [lgb.early_stopping(20), lgb.log_evaluation(10)],
            }
clf.fit(X_train, y_train, **fit_params)
clf.score(X_train, y_train) # classification accuracy score
clf.score(X_test, y_test) # classification accuracy score
cm=confusion_matrix(y_test, clf.predict(X_test))
cm
plt.rcParams['font.size']=15
plt.rcParams['font.family']='Tahoma'
# KMeans numbers its clusters arbitrarily, so a fixed label order can end up on
# the wrong cluster. Derive the label of each cluster id from the data instead:
# clusters are ranked by their mean cases per 100000 (first feature), lowest first.
# NOTE(review): assumes exactly three clusters and that "risk" follows the
# case rate - confirm this matches the intended reading of the clusters.
risk_names=['เสี่ยงน้อย', 'เสี่ยงปานกลาง', 'เสี่ยงมาก'] # low, moderate, high risk
clusters_by_severity = X[feature_cols[0]].groupby(y).mean().sort_values().index
risk_by_cluster = dict(zip(clusters_by_severity, risk_names))
plot_confusion_matrix(clf, X_test, y_test, cmap='Greens', display_labels=[risk_by_cluster[c] for c in clf.classes_]);
y_test.value_counts()
print(classification_report(y_train, clf.predict(X_train)))
print(classification_report(y_test, clf.predict(X_test)))