# Notebook shell escapes: check the dataset directory and install extra packages.
# List the dataset directory to confirm the Excel workbook is present.
!ls /datasets/dataset-soal-covid-19
# openpyxl is installed before the .xlsx workbook is read with pd.read_excel
# (presumably as the Excel engine pandas uses -- confirm for the pandas version in use).
!pip install openpyxl
# statsmodels is imported further down as `sm`.
!pip install statsmodels
'Daily Update Data Agregat Covid-19 Jakarta.xlsx'
Collecting openpyxl
Downloading openpyxl-3.0.7-py2.py3-none-any.whl (243 kB)
|████████████████████████████████| 243 kB 13.8 MB/s
Collecting et-xmlfile
Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.0.7
WARNING: You are using pip version 21.1.2; however, version 21.1.3 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
Collecting statsmodels
Downloading statsmodels-0.12.2-cp37-cp37m-manylinux1_x86_64.whl (9.5 MB)
|████████████████████████████████| 9.5 MB 28.7 MB/s
Requirement already satisfied: pandas>=0.21 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels) (1.2.5)
Requirement already satisfied: numpy>=1.15 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels) (1.19.5)
Requirement already satisfied: scipy>=1.1 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels) (1.7.0)
Collecting patsy>=0.5
Downloading patsy-0.5.1-py2.py3-none-any.whl (231 kB)
|████████████████████████████████| 231 kB 45.3 MB/s
Requirement already satisfied: python-dateutil>=2.7.3 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pandas>=0.21->statsmodels) (2.8.1)
Requirement already satisfied: pytz>=2017.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pandas>=0.21->statsmodels) (2021.1)
Requirement already satisfied: six in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from patsy>=0.5->statsmodels) (1.16.0)
Installing collected packages: patsy, statsmodels
Successfully installed patsy-0.5.1 statsmodels-0.12.2
WARNING: You are using pip version 21.1.2; however, version 21.1.3 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
# Load the Jakarta daily COVID-19 aggregate workbook into a DataFrame.
data = pd.read_excel(
    io="/datasets/dataset-soal-covid-19/Daily Update Data Agregat Covid-19 Jakarta.xlsx",
)
# Bare expression: renders the frame when it is the last statement of a
# notebook cell, and is a no-op otherwise.
data
# Print column names, non-null counts and dtypes.
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 497 entries, 0 to 496
Data columns (total 29 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Tanggal 497 non-null datetime64[ns]
1 Jam 478 non-null float64
2 Tanggal Jam 497 non-null datetime64[ns]
3 Total Pasien 497 non-null int64
4 Sembuh 497 non-null int64
5 Meninggal 497 non-null int64
6 Self Isolation 497 non-null int64
7 Masih Perawatan 497 non-null int64
8 Belum Diketahui (masih verifikasi) 0 non-null float64
9 Menunggu Hasil 6 non-null float64
10 Tenaga Kesehatan Terinfeksi 6 non-null float64
11 Positif Harian 497 non-null int64
12 Positif Aktif 497 non-null int64
13 Sembuh Harian 497 non-null int64
14 Tanpa Gejala 357 non-null float64
15 Bergejala 357 non-null float64
16 Belum Ada Data 357 non-null float64
17 Unnamed: 17 0 non-null float64
18 Unnamed: 18 0 non-null float64
19 Unnamed: 19 0 non-null float64
20 Unnamed: 20 0 non-null float64
21 Unnamed: 21 0 non-null float64
22 Unnamed: 22 0 non-null float64
23 Unnamed: 23 0 non-null float64
24 Unnamed: 24 0 non-null float64
25 Unnamed: 25 0 non-null float64
26 Unnamed: 26 0 non-null float64
27 Unnamed: 27 0 non-null float64
28 Unnamed: 28 0 non-null float64
dtypes: datetime64[ns](2), float64(19), int64(8)
memory usage: 112.7 KB
# Summary statistics for the numeric columns (shown only when this is the
# last expression of a notebook cell).
data.describe()

# Mean, median and mode of the "Positif Harian" column.
positif_harian = data["Positif Harian"]
rata = positif_harian.mean()
median = positif_harian.median()
# value_counts() orders values by descending frequency, so the first index
# label is the most frequent value.
modus = positif_harian.value_counts().index[0]

print(f"Nilai mean dari Positif Harian adalah {rata}")
print(f"Nilai median dari Positif Harian adalah {median}")
print(f"Nilai modus dari Positif Harian adalah {modus}")
Nilai mean dari Positif Harian adalah 1306.456740442656
Nilai median dari Positif Harian adalah 867.0
Nilai modus dari Positif Harian adalah 0
# Largest and smallest values of the "Positif Harian" column.
positif_harian = data["Positif Harian"]
maks_positif = positif_harian.max()
min_positif = positif_harian.min()

print(f"Nilai Maksimum Positif Harian: {maks_positif}")
print(f"Nilai Minimum Positif Harian: {min_positif}")
Nilai Maksimum Positif Harian: 13112
Nilai Minimum Positif Harian: 0
# Box plot of the "Self Isolation" column to inspect its spread and outliers.
visualisasi_data = sns.boxplot(x=data["Self Isolation"])

# Correlation between the "Sembuh" and "Meninggal" columns, computed with
# each of the three methods accepted by Series.corr.
metode_korelasi = ['pearson', 'spearman', 'kendall']
kolom1 = "Sembuh"
kolom2 = "Meninggal"
print("Korelasi dengan tiga metode: ")
for i in metode_korelasi:
    # The print must sit inside the loop body so that it runs once per method.
    print("Metode " + i + " : " + str(data[kolom1].corr(data[kolom2], method=i)))

# Writing the calls out without a loop gives the same values; the loop above
# is simply shorter. As bare expressions these results are not printed
# (a notebook would display only the last one if it ends the cell).
data[kolom1].corr(data[kolom2], method="pearson")
data[kolom1].corr(data[kolom2], method="spearman")
data[kolom1].corr(data[kolom2], method="kendall")
Korelasi dengan tiga metode:
Metode pearson : 0.9964712547307534
Metode spearman : 0.9999811830150749
Metode kendall : 0.9994967336144513
# Build the list of column names to remove from the raw sheet.
#
# How the names are generated:
#   * range(17, 29) yields 17, 18, ..., 28 -- the start is inclusive and the
#     stop is exclusive (likewise range(2, 5) yields 2, 3, 4).
#   * f"Unnamed: {i}" and "Unnamed: " + str(i) produce the same string.
#   * Written by hand the list would start
#     ["Unnamed: 17", "Unnamed: 18", "Unnamed: 19", ...].
drop_column = [f"Unnamed: {i}" for i in range(17, 29)]
# append adds one item to the end of the list; here it is the
# "Belum Diketahui (masih verifikasi)" column, which has no non-null values.
drop_column.append("Belum Diketahui (masih verifikasi)")
# Remove the columns named in drop_column; `data` itself is left unchanged
# because drop returns a new frame.
data_clean = data.drop(drop_column, axis="columns")
# Bare expression: displayed only when it ends a notebook cell.
data_clean
# Print the remaining columns with their non-null counts and dtypes.
data_clean.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 497 entries, 0 to 496
Data columns (total 16 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Tanggal 497 non-null datetime64[ns]
1 Jam 478 non-null float64
2 Tanggal Jam 497 non-null datetime64[ns]
3 Total Pasien 497 non-null int64
4 Sembuh 497 non-null int64
5 Meninggal 497 non-null int64
6 Self Isolation 497 non-null int64
7 Masih Perawatan 497 non-null int64
8 Menunggu Hasil 6 non-null float64
9 Tenaga Kesehatan Terinfeksi 6 non-null float64
10 Positif Harian 497 non-null int64
11 Positif Aktif 497 non-null int64
12 Sembuh Harian 497 non-null int64
13 Tanpa Gejala 357 non-null float64
14 Bergejala 357 non-null float64
15 Belum Ada Data 357 non-null float64
dtypes: datetime64[ns](2), float64(6), int64(8)
memory usage: 62.2 KB
# Overwrite the "Jam" column (which has missing values) with the time-of-day
# component extracted from the "Tanggal Jam" timestamps.
data_clean["Jam"] = data_clean["Tanggal Jam"].dt.time
# Bare expression: displayed only when it ends a notebook cell.
data_clean
# Re-inspect the frame: "Jam" should now be fully populated.
data_clean.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 497 entries, 0 to 496
Data columns (total 16 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Tanggal 497 non-null datetime64[ns]
1 Jam 497 non-null object
2 Tanggal Jam 497 non-null datetime64[ns]
3 Total Pasien 497 non-null int64
4 Sembuh 497 non-null int64
5 Meninggal 497 non-null int64
6 Self Isolation 497 non-null int64
7 Masih Perawatan 497 non-null int64
8 Menunggu Hasil 6 non-null float64
9 Tenaga Kesehatan Terinfeksi 6 non-null float64
10 Positif Harian 497 non-null int64
11 Positif Aktif 497 non-null int64
12 Sembuh Harian 497 non-null int64
13 Tanpa Gejala 357 non-null float64
14 Bergejala 357 non-null float64
15 Belum Ada Data 357 non-null float64
dtypes: datetime64[ns](2), float64(5), int64(8), object(1)
memory usage: 62.2+ KB
# Line chart of "Self Isolation" against "Tanggal".
sns.lineplot(x="Tanggal", y="Self Isolation", data=data_clean)
plt.tight_layout()

# Earliest and latest timestamps in the data, one per line.
print(
    data_clean["Tanggal Jam"].min(),
    data_clean["Tanggal Jam"].max(),
    sep="\n",
)
2020-03-01 18:00:00
2021-07-10 08:00:00
# Summary statistics of the cleaned frame (shown only when this ends a
# notebook cell).
data_clean.describe()

# Use the "Tanggal Jam" timestamps as the row index. One assignment is
# enough: repeating it leaves the frame unchanged.
data_clean.index = data_clean["Tanggal Jam"]
data_clean

# Keep only the measurement columns: the date/time columns are now redundant
# with the index, and the two sparsely populated columns are dropped too.
# NOTE(review): the recorded info() output that follows still lists
# "Menunggu Hasil" and "Tenaga Kesehatan Terinfeksi", so it appears to
# predate this drop list -- re-run to refresh it.
data_clean_date = data_clean.drop(
    columns=[
        "Tanggal",
        "Jam",
        "Tanggal Jam",
        "Menunggu Hasil",
        "Tenaga Kesehatan Terinfeksi",
    ],
)
data_clean_date
data_clean_date.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 497 entries, 2020-03-01 18:00:00 to 2021-07-10 08:00:00
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Total Pasien 497 non-null int64
1 Sembuh 497 non-null int64
2 Meninggal 497 non-null int64
3 Self Isolation 497 non-null int64
4 Masih Perawatan 497 non-null int64
5 Menunggu Hasil 6 non-null float64
6 Tenaga Kesehatan Terinfeksi 6 non-null float64
7 Positif Harian 497 non-null int64
8 Positif Aktif 497 non-null int64
9 Sembuh Harian 497 non-null int64
10 Tanpa Gejala 357 non-null float64
11 Bergejala 357 non-null float64
12 Belum Ada Data 357 non-null float64
dtypes: float64(5), int64(8)
memory usage: 54.4 KB
# Plot three case series against the timestamp index, tightening the layout
# after each plot call.
plt.plot(data_clean_date.index, data_clean_date["Positif Harian"])
plt.tight_layout()
plt.plot(data_clean_date.index, data_clean_date["Positif Aktif"])
plt.tight_layout()
plt.plot(data_clean_date.index, data_clean_date["Total Pasien"])
plt.tight_layout()

# Pearson correlation (the Series.corr default) between "Self Isolation" and
# "Meninggal"; shown only when this ends a notebook cell.
data_clean_date["Self Isolation"].corr(data_clean_date["Meninggal"], method="pearson")