WHO-COVID-19-Analysis

import math import sys import pandas as pd import seaborn as sns import numpy as np import matplotlib.pyplot as plt import plotly.express as px sns.set_style('whitegrid') %matplotlib inline %config InlineBackend.figure_format = 'retina'

import warnings warnings.filterwarnings('ignore')

# Record interpreter and library versions so results can be tied to an environment.
print(f'Python version: {sys.version}')
print(f'pandas version: {pd.__version__}')
print(f'numpy version: {np.__version__}')
print(f'seaborn version: {sns.__version__}')
# Timestamp of this run; append .strftime('%Y-%m-%d') for a date-only stamp.
pd.Timestamp.now()

from datetime import datetime
import socket  # part of the standard library; nothing to install

print(f'last run: {datetime.now()}')
# The host lookup is informational only, so a resolver failure must not stop
# the notebook. Only OS/network errors (socket.gaierror and socket.herror are
# OSError subclasses) are tolerated, and the failure is reported instead of
# being silently discarded.
try:
    print(f'host info: {socket.gethostbyname_ex(socket.gethostname())}')
except OSError as err:
    print(f'host info: unavailable ({err})')

# WHO global table snapshot, read straight from the GitHub repository.
data_url = 'https://raw.githubusercontent.com/GuySuphakit/WHO-COVID-19-Analysis/main/WHO%20COVID-19%20global%20table%20data%20June%2027th%202021%20at%205.02.05%20AM.csv'
df = pd.read_csv(data_url)
df[:5]

def _is_text_or_category(dtype) -> bool:
    """Return True when *dtype* is categorical or a text-like (object/string) dtype."""
    return dtype.name == 'category' or pd.api.types.is_string_dtype(dtype)


def df_unique_value(df):
    """Print the distinct values of every text or categorical column of *df*.

    Each matching column prints its name, its unique values on the next line,
    and a 65-character dashed separator. Other columns are skipped.
    """
    for c in df.columns:
        if _is_text_or_category(df[c].dtype):
            print(f'{c:10}\n{df[c].unique()}')
            print('-' * 65)


def convert_cols_to_category(df, cols: list):
    """Convert the columns named in *cols* to ``category`` dtype, in place.

    Args:
        df: DataFrame to modify.
        cols: Column labels to convert. A single label given as a string is
            also accepted and treated as a one-element list.

    Returns:
        The same *df* object, to allow chaining.

    Raises:
        KeyError: If any requested column is missing (raised before any
            column is modified).
    """
    # A bare string would otherwise select a Series, and iterating that walks
    # the cell values rather than the column labels.
    cols = [cols] if isinstance(cols, str) else list(cols)
    for c in df[cols].columns:
        df[c] = df[c].astype('category')
    return df


def convert_obj_columns_to_category(df):
    """Convert every text or categorical column of *df* to ``category``, in place.

    Returns:
        The same *df* object, to allow chaining.
    """
    for c in df.columns:
        if _is_text_or_category(df[c].dtype):
            df[c] = df[c].astype('category')
    return df


def print_category_columns(df):
    """Print the ``(code, label)`` pairs of every categorical column of *df*.

    Each categorical column prints one line followed by a 60-character dashed
    separator.
    """
    for c in df.columns:
        if df[c].dtype.name == 'category':
            print(f'{c:15}: {list(enumerate(df[c].cat.categories))}')
            print('-' * 60)


def plot_mn(df, cols, n_rows: int = 1, kind: str = 'boxplot', color='salmon'):
    """Draw one small plot per column on a grid with *n_rows* rows.

    Categorical columns are always drawn as count plots. Every other column
    is drawn according to *kind*.

    Example:
        plot_mn(df, ['Calories', 'Fat'], 2, 'hist')

    Args:
        df: Source DataFrame.
        cols: Column labels to plot, in grid order.
        n_rows: Number of grid rows; the column count is derived from it.
        kind: One of 'boxplot', 'boxen', 'violin' or 'hist' (case-insensitive).
        color: Colour used for the non-categorical plots.

    Raises:
        ValueError: If a non-categorical column must be drawn and *kind* is
            not one of the supported names.
    """
    cols = list(cols)
    if not cols:
        return  # nothing to draw; avoids asking matplotlib for an empty grid

    kind_key = kind.lower()
    supported = ('boxplot', 'boxen', 'violin', 'hist')
    needs_kind = any(df[c].dtype.name != 'category' for c in cols)
    if needs_kind and kind_key not in supported:
        raise ValueError(f'unsupported kind {kind!r}; expected one of {supported}')

    n_cols = math.ceil(len(cols) / n_rows)
    # squeeze=False always returns a 2-D array of Axes, so ravel() is safe
    # even for a 1x1 grid (where matplotlib would otherwise return one Axes).
    fig, ax = plt.subplots(n_rows, n_cols, figsize=(n_cols * 3, n_rows * 3.5),
                           squeeze=False)
    ax = ax.ravel()

    for i, c in enumerate(cols):
        if df[c].dtype.name == 'category':
            sns.countplot(data=df, x=c, ax=ax[i])
        elif kind_key == 'boxplot':
            sns.boxplot(data=df[[c]], ax=ax[i], color=color)
        elif kind_key == 'boxen':
            sns.boxenplot(data=df[[c]], ax=ax[i], color=color)
        elif kind_key == 'violin':
            sns.violinplot(data=df[[c]], ax=ax[i], color=color)
        else:  # 'hist'
            # NOTE(review): seaborn marks distplot as deprecated; kept so the
            # binning of existing figures does not change.
            sns.distplot(df[c], hist=True, kde=False, ax=ax[i], color=color)

    # Hide grid cells left over when len(cols) is not a multiple of n_rows.
    for spare in ax[len(cols):]:
        spare.set_visible(False)

    # Lay out after drawing so tick labels and titles are accounted for.
    fig.tight_layout()

# Remove exact duplicate rows, then summarise the column dtypes and null counts.
df.drop_duplicates(inplace=True)
df.info()

df.head(5)

# Number of missing values per column.
df.isnull().sum()

# List the unique values of each text/categorical column.
df_unique_value(df)

# Rows lacking either the per-100000 case rate or a WHO region.
df.loc[
    df[['Cases - cumulative total per 100000 population', 'WHO Region']]
    .isna()
    .any(axis=1)
]

# Sample of the rows classified as having no cases.
df.loc[df['Transmission Classification'].eq('No cases')].head(5)

# Sample of the rows whose classification is "Not applicable" (not usable).
df.loc[df['Transmission Classification'].eq('Not applicable')].head(5)

# Convert object columns to category so they can be drawn with countplot.
df = convert_obj_columns_to_category(df)

# Drop the rows labelled 0 and 201, which the original author's note identifies
# as the "Global" and "others" aggregate rows.
df.drop(index=[0, 201], inplace=True)

# Drop every row whose Transmission Classification is "No cases".
# The original call had neither inplace=True nor an assignment, so its result
# was discarded and these rows stayed in df.
no_case_rows = df[df['Transmission Classification'] == 'No cases'].index
df.drop(index=no_case_rows, inplace=True)

df.columns

# The case and death measures share the same five suffixes, so the column
# list is assembled from the two prefixes instead of being spelled out.
measures = [
    'cumulative total',
    'cumulative total per 100000 population',
    'newly reported in last 7 days',
    'newly reported in last 7 days per 100000 population',
    'newly reported in last 24 hours',
]
cols = (
    ['Name', 'WHO Region']
    + [f'{prefix} - {measure}' for prefix in ('Cases', 'Deaths') for measure in measures]
    + ['Transmission Classification']
)

# Rough overview plot of every feature, laid out on three rows.
plot_mn(df, df.columns, n_rows=3, kind='hist')

# Count of rows per WHO region, largest first, as an interactive bar chart.
# (A seaborn countplot ordered by value_counts() is the static alternative.)
region_col = "WHO Region"
temp_df = (
    df.groupby(region_col, as_index=False)
    .agg(counts=(region_col, "count"))
    .sort_values("counts", ascending=False)
)
px.bar(
    temp_df,
    x=region_col,
    y='counts',
    color=region_col,
    color_discrete_sequence=px.colors.qualitative.Prism,
    title='WHO Region Countplot',
)

# Same chart for the transmission classification.
# (A seaborn countplot ordered by value_counts() is the static alternative.)
transmission_col = "Transmission Classification"
temp_df = (
    df.groupby(transmission_col, as_index=False)
    .agg(counts=(transmission_col, "count"))
    .sort_values("counts", ascending=False)
)
px.bar(
    temp_df,
    x=transmission_col,
    y='counts',
    color=transmission_col,
    color_discrete_sequence=px.colors.qualitative.Vivid,
    title='Transmission Classification Countplot',
)

# Histogram with a marginal box plot: cumulative cases.
cases_hist_style = {
    "marginal": "box",
    "color_discrete_sequence": px.colors.qualitative.D3,
}
px.histogram(
    df,
    x="Cases - cumulative total",
    title='Cases - cumulative total Countplot',
    **cases_hist_style,
)

# Histogram with a marginal box plot: cumulative cases per 100000 population.
px.histogram(
    df,
    x="Cases - cumulative total per 100000 population",
    title='Cases - cumulative total per 100000 population Countplot',
    **cases_hist_style,
)

# Histogram with a marginal box plot: cumulative deaths.
deaths_hist_style = {
    "marginal": "box",
    "color_discrete_sequence": px.colors.qualitative.Set1,
}
px.histogram(
    df,
    x="Deaths - cumulative total",
    title='Deaths - cumulative total',
    **deaths_hist_style,
)

# Histogram with a marginal box plot: cumulative deaths per 100000 population.
px.histogram(
    df,
    x="Deaths - cumulative total per 100000 population",
    title='Deaths - cumulative total per 100000 population Countplot',
    **deaths_hist_style,
)

# Joint density of the two per-100000 rates.
px.density_heatmap(
    df,
    x="Cases - cumulative total per 100000 population",
    y="Deaths - cumulative total per 100000 population",
    color_continuous_scale="BuGn",
    title='Cases and Deaths total per 100000 population Countplot',
)

# Ten rows with the highest cumulative case count.
# The top-10 frame is computed once and handed to plotly together with column
# names, instead of recomputing nlargest() for x, y and color while passing
# the full df as data_frame. Referencing columns by name also lets plotly take
# the axis and colour-bar titles from the column names.
top_cases = df.nlargest(10, 'Cases - cumulative total')
px.bar(
    top_cases,
    x='Name',
    y='Cases - cumulative total',
    color='Cases - cumulative total',
    color_continuous_scale="PuBu",
    title='Top 10 ประเทศที่มีจำนวนเคสผู้ติดเชื้อรวมมากที่สุด',
)

# Ten rows with the highest cumulative death count, built the same way.
top_deaths = df.nlargest(10, 'Deaths - cumulative total')
px.bar(
    top_deaths,
    x='Name',
    y='Deaths - cumulative total',
    color='Deaths - cumulative total',
    color_continuous_scale="Reds",
    title='Top 10 ประเทศที่มีจำนวนผู้เสียชีวิตรวมมากที่สุด',
)

# Pairwise correlation of the numeric features.
# `cols` also names categorical columns (Name, WHO Region, Transmission
# Classification); they are filtered out explicitly because recent pandas
# versions raise on non-numeric input to corr() instead of dropping it.
dcorr = df[cols].select_dtypes(include='number').corr()

# Boolean mask hiding the upper triangle (diagonal included) so each pair is
# shown only once.
mask = np.triu(np.ones(dcorr.shape, dtype=bool))

fig, ax = plt.subplots(figsize=(7, 5))
sns.heatmap(
    dcorr,
    cmap=sns.diverging_palette(10, 145, n=100),
    vmin=-1,
    vmax=1,
    center=0,
    linewidths=1,
    annot=True,
    mask=mask,
    ax=ax,
).set_title("Correlation\nHeatmap", fontsize=22, fontweight="bold");

# The two per-100000 rates used as clustering features.
cols2 = [
    'Cases - cumulative total per 100000 population',
    'Deaths - cumulative total per 100000 population',
]

df[cols2].head()

# Take an explicit copy: a 'cluster' column is added to X further down, and
# writing into a selection of df triggers pandas' SettingWithCopy warning.
X = df[cols2].copy()

from sklearn.cluster import KMeans from sklearn.metrics import silhouette_samples, silhouette_score !pip install yellowbrick from yellowbrick.cluster import SilhouetteVisualizer

def sil_score(X, from_k=2, to_k=6, random_state=None):
    """Compute the average silhouette score of KMeans for a range of k.

    Args:
        X: Feature matrix passed to ``KMeans.fit`` and ``silhouette_score``.
        from_k: Smallest number of clusters to try (inclusive).
        to_k: Largest number of clusters to try (inclusive).
        random_state: Forwarded to ``KMeans`` so the scores can be reproduced.
            The default ``None`` keeps the original unseeded behaviour.

    Returns:
        A list of ``[score, k]`` pairs, one per k in ascending order, with the
        score rounded to four decimals. Because the score comes first,
        ``max(result)`` selects the best-scoring pair.
    """
    sils = []
    for k in range(from_k, to_k + 1):
        m = KMeans(n_clusters=k, random_state=random_state)
        m.fit(X)
        silhouette_avg = silhouette_score(X, m.labels_).round(4)
        sils.append([silhouette_avg, k])
    return sils

# Score k = 2..6 and report the k with the highest silhouette score.
# Each entry is [score, k], so max() compares on the score first.
ss = sil_score(X, from_k=2, to_k=6)
best_pair = max(ss)
print(f'scores = {ss}')
print(f'optimal number of clusters = {best_pair[1]}')

def silhouette_plot(X, from_k, to_k):
    """Draw a silhouette plot of KMeans for every k in ``[from_k, to_k]``.

    Args:
        X: Feature matrix fitted by each visualizer.
        from_k: Smallest number of clusters to plot (inclusive).
        to_k: Largest number of clusters to plot (inclusive).

    Returns:
        A list of ``[silhouette_score, k]`` pairs in ascending k order.
    """
    sil_scores = []
    for k in range(from_k, to_k + 1):
        # Wrap a fresh clustering model in the visualizer for this k.
        m = KMeans(n_clusters=k)
        visualizer = SilhouetteVisualizer(m)
        visualizer.fit(X)  # fit the data through the visualizer
        # show() is the current name of the rendering call; poof() is its
        # deprecated alias.
        visualizer.show()
        sil_scores.append([visualizer.silhouette_score_, k])
    return sil_scores

# Silhouette plots for k = 2..6.
scores = silhouette_plot(X, 2, 6)

# Final model with three clusters.
# A fixed random_state keeps the cluster numbering stable between runs; later
# cells attach hard-coded meanings to the cluster ids.
# NOTE(review): confirm that the hard-coded risk labels used for the confusion
# matrix still match the cluster ids produced with this seed.
model = KMeans(n_clusters=3, random_state=1)
model.fit(X)

# Cluster centres, rounded for display.
model.cluster_centers_.round(3)

# Distance between each point and every centroid.
model.transform(X)

# Attach the fitted cluster id to every row.
X['cluster'] = model.labels_
X

# Descriptive statistics per cluster, transposed so clusters are the columns.
X.groupby('cluster').describe().T

# Boxen plot of each feature, split by cluster.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 9))
ax = ax.ravel()
for panel, col in zip(ax, cols2):
    sns.boxenplot(x='cluster', y=col, data=X, ax=panel)

# Per-cluster medians as heatmaps: column 0 (case rate) in blue on the left,
# column 1 (death rate) in red on the right; column 2 is the cluster id.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 9))
for position, palette in enumerate(("Blues", "Reds")):
    medians = X.iloc[:, [position, 2]].groupby('cluster').median()
    sns.heatmap(
        medians.sort_values(by='cluster'),
        cmap=palette,
        linewidths=1,
        ax=ax[position],
    )
fig.show()

# df2 is a second name for the same labelled frame (not a copy).
df2 = X
df2

!pip install lightgbm import lightgbm as lgb print(f'lightgbm version {lgb.__version__}')

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, ShuffleSplit, cross_val_score, cross_val_predict, cross_validate from sklearn.metrics import confusion_matrix, plot_confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, precision_recall_fscore_support, f1_score

df2.columns

# Features are the two per-100000 rates; the target is the cluster id.
feature_cols = [
    'Cases - cumulative total per 100000 population',
    'Deaths - cumulative total per 100000 population',
]
target_col = 'cluster'
X, y = df2[feature_cols], df2[target_col]

# 80/20 split, stratified on the target, with a fixed seed.
split_options = {'test_size': 0.20, 'random_state': 1, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Classifier with library defaults, created to inspect its parameters.
clf = lgb.LGBMClassifier()

clf.get_params()

# Tuning guides:
#   https://lightgbm.readthedocs.io/en/latest/Parameters-Tuning.html
#   https://sites.google.com/view/lauraepp/parameters
# NOTE(review): 'silent' here, and the 'early_stopping_rounds' / 'verbose' fit
# arguments below, appear to have been removed in LightGBM 4.x in favour of
# callbacks - confirm against the installed version before upgrading.
params = {
    'boosting_type': 'gbdt',
    'class_weight': None,
    'colsample_bytree': 1.0,
    'importance_type': 'split',
    'learning_rate': 0.1,
    'max_depth': -1,
    'min_child_samples': 20,
    'min_child_weight': 0.001,
    'min_split_gain': 0.0,
    'n_estimators': 100,
    'n_jobs': -1,
    'num_leaves': 31,
    'objective': None,
    'random_state': None,
    'reg_alpha': 0.0,
    'reg_lambda': 0.0,
    'silent': True,
    'subsample': 1.0,
    'subsample_for_bin': 200000,
    'subsample_freq': 0,
}
clf = lgb.LGBMClassifier(**params)

# Metric reference:
#   https://lightgbm.readthedocs.io/en/latest/Parameters.html?highlight=metric#metric-parameters
# The target is the 3-cluster KMeans label, i.e. a multiclass problem, so the
# multiclass log-loss is requested explicitly instead of 'binary_logloss'.
# NOTE(review): early stopping is evaluated on the test split, so the test
# scores reported afterwards are optimistic; consider a separate validation set.
fit_params = {
    'early_stopping_rounds': 20,
    'eval_metric': 'multi_logloss',
    'eval_set': [(X_test, y_test)],
    'verbose': 10,
}
clf.fit(X_train, y_train, **fit_params)

# Classification accuracy on the training split.
clf.score(X_train, y_train)

# Classification accuracy on the test split.
clf.score(X_test, y_test)

# Confusion matrix of the test predictions.
test_pred = clf.predict(X_test)
cm = confusion_matrix(y_test, test_pred)
cm

# Plot the confusion matrix; the font settings are needed for the Thai labels.
# NOTE(review): display_labels assigns a fixed meaning to cluster ids 0, 1, 2 -
# verify against the cluster centres of the fitted KMeans model.
plt.rcParams.update({'font.size': 15, 'font.family': 'Tahoma'})
plot_confusion_matrix(
    clf,
    X_test,
    y_test,
    cmap='Greens',
    display_labels=['เสี่ยงปานกลาง', 'เสี่ยงน้อย', 'เสี่ยงมาก'],
);

# Class balance of the test split.
y_test.value_counts()

# Per-class precision/recall/F1 on the training split.
train_pred = clf.predict(X_train)
print(classification_report(y_train, train_pred))

# Per-class precision/recall/F1 on the test split.
print(classification_report(y_test, test_pred))