import pandas as pd
# Load the preprocessed hotel dataset (path is relative to the working directory).
data = pd.read_csv('data/hotel_preprocess_v2.csv')
data
data.describe()

# Keep only the columns used by the rest of the script. The column order
# matters: later code converts this frame to a NumPy array and addresses the
# columns by position (0 = Area ... 8 = Price_IDR, 9 = hotelClass).
# .copy() makes data_fill an independent frame, so the in-place drops applied
# to it later do not raise pandas' SettingWithCopyWarning.
data_fill = data[
    [
        'Area',
        'AirConditioner',
        'Bar',
        'Pool',
        'Spa',
        'FitenessCenter',
        'Restaurant',
        'HotTub',
        'Price_IDR',
        'hotelClass',
    ]
].copy()

# Two-column snapshot (price, class) used to look up per-class price thresholds.
data2 = data_fill[['Price_IDR', 'hotelClass']]
data2
# ---------------------------------------------------------------------------
# Price outlier removal, one hotel class at a time.
# Thresholds are read from data2 (the price/class snapshot); the rows are
# removed in place from data_fill.
# ---------------------------------------------------------------------------

# Hotel class 2: prices sorted ascending, for inspection only.
df = data2[data2['hotelClass'] == 2]
rslt_df = df.sort_values('Price_IDR')

# Drop class-3 hotels priced above 580_000.
# NOTE(review): this drop targets class 3 although the view built just above
# is for class 2 -- confirm this is intended.
class3_over_cap = data_fill.loc[
    (data_fill['hotelClass'] == 3) & (data_fill['Price_IDR'] > 580_000)
].index
data_fill.drop(index=class3_over_cap, inplace=True)
rslt_df

# Hotel class 3: drop everything cheaper than the price found at sorted
# position 121.
# NOTE(review): the position is hard-coded; the printed label suggests it is
# meant to be the first quartile -- confirm against the current data size.
df = data2[data2['hotelClass'] == 3]
rslt_df = df.sort_values('Price_IDR')
threshold_price3 = rslt_df.to_numpy()[121, 0]
class3_below_floor = data_fill.loc[
    (data_fill['hotelClass'] == 3) & (data_fill['Price_IDR'] < threshold_price3)
].index
data_fill.drop(index=class3_below_floor, inplace=True)
print(f' Kuartil 1: {threshold_price3}')
data_fill

# Hotel class 4: same procedure, threshold taken from sorted position 54.
df = data2[data2['hotelClass'] == 4]
rslt_df = df.sort_values('Price_IDR')
threshold_price4 = rslt_df.to_numpy()[54, 0]
class4_below_floor = data_fill.loc[
    (data_fill['hotelClass'] == 4) & (data_fill['Price_IDR'] < threshold_price4)
].index
data_fill.drop(index=class4_below_floor, inplace=True)
print(f' Kuartil 1: {threshold_price4}')
data_fill

# Hotel class 5: same procedure, threshold taken from sorted position 11.
# The threshold is not printed for this class.
df = data2[data2['hotelClass'] == 5]
rslt_df = df.sort_values('Price_IDR')
threshold_price5 = rslt_df.to_numpy()[11, 0]
class5_below_floor = data_fill.loc[
    (data_fill['hotelClass'] == 5) & (data_fill['Price_IDR'] < threshold_price5)
].index
data_fill.drop(index=class5_below_floor, inplace=True)

# Frame after outlier filtering.
data_fill
data_fill
# Convert the filtered DataFrame to a NumPy array; every row is then
# re-encoded in place.
data_list = data_fill.values
data_list[:10]

# Column positions in data_list (same order as the data_fill columns):
#   0      Area
#   1..7   AirConditioner, Bar, Pool, Spa, FitenessCenter, Restaurant, HotTub
#   8      Price_IDR
#   9      hotelClass

# Area name -> integer code.
area_codes = {
    'Jakarta': 0,
    'Surabaya': 1,
    'Jogjakarta': 2,
    'Bandung': 3,
}

# Hotel class (2..5) -> zero-based integer code.
hotel_class_codes = {
    2: 0,
    3: 1,
    4: 2,
    5: 3,
}

# Amenity flags are stored in columns 1..7.
amenity_columns = range(1, 8)

# Single pass over the rows. The loop variable is named `row` so it does not
# overwrite the module-level DataFrame `data`.
for row in data_list:
    # Area: raises KeyError for an area that is not in area_codes.
    row[0] = area_codes[row[0]]

    # Amenities: 0 = False, 1 = True (any truthy value becomes 1).
    for col in amenity_columns:
        row[col] = 0 if not row[col] else 1

    # Price_IDR: replace the numeric price with its price-class label,
    # which is the prediction target.
    price = row[8]
    if price < 200_000:
        row[8] = 'Kelas Harga 1 : < 200_000'
    elif price < 450_000:
        row[8] = 'Kelas Harga 2 : 200.000 - 450.000'
    elif price < 700_000:
        row[8] = 'Kelas Harga 3 : 450.000 - 700.000'
    else:
        row[8] = 'Kelas Harga 4 : >= 700.000'

    # Hotel class: raises KeyError for a class outside 2..5.
    row[9] = hotel_class_codes[row[9]]

data_list[:10]
data_list
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB

# Features: every encoded column except the price class (position 8).
x = data_list[:, [0, 1, 2, 3, 4, 5, 6, 7, 9]]
# Target: the price-class label stored at position 8 (second-to-last column).
y = data_list[:, 8]
len(x), len(y)

# Hold out 25% of the rows for validation. A fixed seed keeps the split, and
# therefore the reported score, reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    x, y, test_size=0.25, random_state=42
)
data_fill
X_train.dtype, y_train.dtype

# Number of possible codes per feature column, in the order of `x`:
# Area has 4 codes, each of the 7 amenity flags has 2, hotel class has 4.
# Declaring them up front prevents an IndexError at predict time when a code
# happens to be missing from the training split.
categories_per_feature = [4, 2, 2, 2, 2, 2, 2, 2, 4]

clf = CategoricalNB(min_categories=categories_per_feature)
clf.fit(X_train, y_train)
clf.score(X_val, y_val)  # mean accuracy on the validation split
X_val, y_val

pred_data = clf.predict(X_val)
pred_data
import numpy as np
from seaborn import heatmap
from sklearn.metrics import confusion_matrix, classification_report

# scikit-learn metrics take (y_true, y_pred) in that order. With the arguments
# swapped the confusion matrix is transposed and precision/recall are
# exchanged in the report.
cm = confusion_matrix(y_val, pred_data)
heatmap(cm, annot=True)  # rows: true price class, columns: predicted class
print(classification_report(y_val, pred_data))

# Predict the price class of one hand-written sample, using the same feature
# order as training: Area code 2 (Jogjakarta), all seven amenities present,
# hotel class code 3 (5-star).
data_pred = np.array([[2, 1, 1, 1, 1, 1, 1, 1, 3]])
print(data_pred)
clf.predict(data_pred)