# Import the libraries used by this notebook.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from wordcloud import WordCloud

# Select the matplotlib style sheet only after every import has run, so the
# import section stays free of executable statements (PEP 8 / E402).
plt.style.use("fivethirtyeight")
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36322 entries, 0 to 36321
Data columns (total 26 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 publish_time 36322 non-null object
1 channel_id 36322 non-null object
2 title 36322 non-null object
3 description 35400 non-null object
4 thumbnail_url 0 non-null float64
5 thumbnail_width 0 non-null float64
6 thumbnail_height 0 non-null float64
7 channel_name 36322 non-null object
8 tags 32214 non-null object
9 category_id 36322 non-null int64
10 live_status 36322 non-null object
11 local_title 36322 non-null object
12 local_description 35400 non-null object
13 duration 36322 non-null object
14 dimension 36322 non-null object
15 definition 36322 non-null object
16 caption 36322 non-null bool
17 license_status 36322 non-null bool
18 allowed_region 1412 non-null object
19 blocked_region 1582 non-null object
20 view 36314 non-null float64
21 like 35857 non-null float64
22 dislike 35857 non-null float64
23 favorite 36322 non-null int64
24 comment 36143 non-null float64
25 trending_time 36322 non-null object
dtypes: bool(2), float64(7), int64(2), object(15)
memory usage: 6.7+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36322 entries, 0 to 36321
Data columns (total 26 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 publish_time 36322 non-null datetime64[ns]
1 channel_id 36322 non-null object
2 title 36322 non-null object
3 description 35400 non-null object
4 thumbnail_url 0 non-null float64
5 thumbnail_width 0 non-null float64
6 thumbnail_height 0 non-null float64
7 channel_name 36322 non-null object
8 tags 32214 non-null object
9 category_id 36322 non-null int64
10 live_status 36322 non-null object
11 local_title 36322 non-null object
12 local_description 35400 non-null object
13 duration 36322 non-null object
14 dimension 36322 non-null object
15 definition 36322 non-null object
16 caption 36322 non-null bool
17 license_status 36322 non-null bool
18 allowed_region 1412 non-null object
19 blocked_region 1582 non-null object
20 view 36314 non-null float64
21 like 35857 non-null float64
22 dislike 35857 non-null float64
23 favorite 36322 non-null int64
24 comment 36143 non-null float64
25 trending_time 36322 non-null datetime64[ns]
dtypes: bool(2), datetime64[ns](2), float64(7), int64(2), object(13)
memory usage: 6.7+ MB
Columns with no missing value: Index(['publish_time', 'channel_id', 'title', 'channel_name', 'category_id',
'live_status', 'local_title', 'duration', 'dimension', 'definition',
'caption', 'license_status', 'favorite', 'trending_time'],
dtype='object')
Columns with missing values: Index(['description', 'thumbnail_url', 'thumbnail_width', 'thumbnail_height',
'tags', 'local_description', 'allowed_region', 'blocked_region', 'view',
'like', 'dislike', 'comment'],
dtype='object')
Presentase missing value pada kolom description adalah: 2.54%
Presentase missing value pada kolom thumbnail_url adalah: 100.0%
Presentase missing value pada kolom thumbnail_width adalah: 100.0%
Presentase missing value pada kolom thumbnail_height adalah: 100.0%
Presentase missing value pada kolom tags adalah: 11.31%
Presentase missing value pada kolom local_description adalah: 2.54%
Presentase missing value pada kolom allowed_region adalah: 96.11%
Presentase missing value pada kolom blocked_region adalah: 95.64%
Presentase missing value pada kolom view adalah: 0.02%
Presentase missing value pada kolom like adalah: 1.28%
Presentase missing value pada kolom dislike adalah: 1.28%
Presentase missing value pada kolom comment adalah: 0.49%
categorical_cols
numerical_cols