# Naive-Bayes-Hotel-Price-Classement-Estimation/model

import pandas as pd

# Load the pre-processed hotel data set and take a first look at it.
csv_path = 'data/hotel_preprocess_v2.csv'
data = pd.read_csv(csv_path)
data  # notebook display of the raw frame

# Summary statistics as produced by pandas' describe().
data.describe()

# Columns kept for modelling: the area, seven amenity flags, the price in IDR
# and the hotel class.
model_columns = [
    'Area',
    'AirConditioner',
    'Bar',
    'Pool',
    'Spa',
    'FitenessCenter',
    'Restaurant',
    'HotTub',
    'Price_IDR',
    'hotelClass',
]
# .copy() makes data_fill an independent frame. It is modified later with
# drop(..., inplace=True); without the explicit copy pandas treats it as a
# slice of `data` and emits SettingWithCopyWarning.
data_fill = data[model_columns].copy()

# Price / class pairs taken before any rows are dropped from data_fill.
data2 = data_fill[['Price_IDR', 'hotelClass']]
data2

# Hotel class 2: list its prices in ascending order and remove the rows whose
# price exceeds 580,000 IDR.
df = data2.loc[data2['hotelClass'] == 2]
rslt_df = df.sort_values(by=['Price_IDR'])

# Remove rows using the drop() function.
# NOTE(review): the original mask tested `hotelClass == 3` although this cell
# inspects class 2 (every other class cell filters its own class). It is
# assumed to be a copy-paste slip and now targets class 2 -- confirm against
# the intended outlier rule.
data_fill.drop(
    data_fill[
        (data_fill['Price_IDR'] > 580_000) & (data_fill['hotelClass'] == 2)
    ].index,
    inplace=True,
)
rslt_df

# Hotel class 3: use the price found at position 121 of the ascending price
# list as a lower cut-off (printed below as "Kuartil 1") and discard the
# class-3 rows that are cheaper than it.
df = data2[data2['hotelClass'] == 3]
rslt_df = df.sort_values('Price_IDR')
# Column 0 of data2 is Price_IDR, so [121][0] is the price of that row.
threshold_price3 = rslt_df.values[121][0]

is_class3 = data_fill['hotelClass'] == 3
below_cutoff = data_fill['Price_IDR'] < threshold_price3
data_fill.drop(data_fill.index[below_cutoff & is_class3], inplace=True)

print(f' Kuartil 1: {threshold_price3}')
data_fill

# Hotel class 4: use the price found at position 54 of the ascending price
# list as a lower cut-off (printed below as "Kuartil 1") and discard the
# class-4 rows that are cheaper than it.
df = data2[data2['hotelClass'] == 4]
rslt_df = df.sort_values('Price_IDR')
# Column 0 of data2 is Price_IDR, so [54][0] is the price of that row.
threshold_price4 = rslt_df.values[54][0]

is_class4 = data_fill['hotelClass'] == 4
below_cutoff = data_fill['Price_IDR'] < threshold_price4
data_fill.drop(data_fill.index[below_cutoff & is_class4], inplace=True)

print(f' Kuartil 1: {threshold_price4}')
data_fill

# Hotel class 5: use the price found at position 11 of the ascending price
# list as a lower cut-off and discard the class-5 rows that are cheaper than
# it. Unlike the class 3 and 4 cells, nothing is printed here.
df = data2[data2['hotelClass'] == 5]
rslt_df = df.sort_values('Price_IDR')
# Column 0 of data2 is Price_IDR, so [11][0] is the price of that row.
threshold_price5 = rslt_df.values[11][0]

is_class5 = data_fill['hotelClass'] == 5
below_cutoff = data_fill['Price_IDR'] < threshold_price5
data_fill.drop(data_fill.index[below_cutoff & is_class5], inplace=True)
data_fill

# Frame as it stands after the outlier filtering above.
data_fill

# From pandas DataFrame to NumPy array; the encoding steps that follow work
# on this array, one row at a time.
data_list = data_fill.values
data_list[0:10]

# Area (column 0): replace the city name with an integer code.
# 0 = Jakarta, 1 = Surabaya, 2 = Jogjakarta, 3 = Bandung
dom = {
    'Jakarta': 0,
    'Surabaya': 1,
    'Jogjakarta': 2,
    'Bandung': 3,
}
# The loop variable is `row` rather than `data`: the original loop rebound
# the name `data`, which holds the DataFrame loaded from the CSV, so any
# earlier cell that reads `data` broke when re-run after this one.
# A city that is not in `dom` raises KeyError.
for row in data_list:
    row[0] = dom[row[0]]
data_list

# Amenity flags: encode each one as 0 = False, 1 = True.
# Column positions in data_list:
#   1 = AirConditioner, 2 = Bar, 3 = Pool, 4 = Spa,
#   5 = FitenessCenter, 6 = Restaurant, 7 = HotTub
# The original ran one copy-pasted loop per column (seven passes over the
# array) and used `data` as the loop variable, overwriting the DataFrame of
# that name. A single pass with a neutral variable name gives the same result.
flag_columns = range(1, 8)
for row in data_list:
    for col in flag_columns:
        # Plain truthiness test, exactly as in the original expression
        # `0 if not value else 1`.
        row[col] = 1 if row[col] else 0
data_list

# Price IDR (column 8): replace the numeric price with one of four price-class
# labels. Each entry is (exclusive upper bound, label); a price gets the label
# of the first bound it is below, and anything from 700,000 up gets the last
# label.
price_classes = (
    (200_000, 'Kelas Harga 1 : < 200_000'),
    (450_000, 'Kelas Harga 2 : 200.000 - 450.000'),
    (700_000, 'Kelas Harga 3 : 450.000 - 700.000'),
)
top_price_class = 'Kelas Harga 4 : >= 700.000'

for row in data_list:
    price = row[8]
    row[8] = next(
        (label for upper, label in price_classes if price < upper),
        top_price_class,
    )
data_list[:10]

# Hotel class (column 9): map the class values 2..5 onto the codes 0..3.
dom = {
    2: 0,
    3: 1,
    4: 2,
    5: 3,
}
# The loop variable is `row` rather than `data` so that the DataFrame bound to
# `data` is not overwritten by this cell. A class outside 2..5 raises KeyError.
for row in data_list:
    row[9] = dom[row[9]]
data_list

# Features: area, the seven amenity flags and the hotel class (columns 0-7
# and 9). Target: the price-class label in column 8, the second-to-last
# column.
feature_columns = [*range(8), 9]
target_column = 8
x = data_list[:, feature_columns]
y = data_list[:, target_column]

# Both must contain the same number of samples.
len(x), len(y)

from sklearn.model_selection import train_test_split

# Hold out 25% of the samples for validation. A fixed random_state makes the
# split, and therefore every score and report computed from it, reproducible
# between runs; the original call produced a different split each time.
X_train, X_val, y_train, y_val = train_test_split(
    x, y, test_size=0.25, random_state=42
)

# Filtered frame, shown for reference.
data_fill

# Element types of the training arrays.
X_train.dtype, y_train.dtype

from sklearn.naive_bayes import CategoricalNB

# Number of distinct codes each feature can take, in the column order of `x`:
# area (4 cities), seven 0/1 amenity flags, hotel class (4 codes).
# Passing it as min_categories keeps predict() from raising IndexError when
# the validation split (or a hand-made sample) contains a code that happens
# to be absent from the training split. When every code is present in the
# training data the fitted model is unchanged.
n_codes_per_feature = [4, 2, 2, 2, 2, 2, 2, 2, 4]
clf = CategoricalNB(min_categories=n_codes_per_feature)
clf.fit(X_train, y_train)

# Mean accuracy on the validation split.
clf.score(X_val, y_val)

# Validation inputs and their true price classes.
X_val, y_val

# Predicted price class for every validation sample.
pred_data = clf.predict(X_val)
pred_data

from sklearn.metrics import confusion_matrix, classification_report
from seaborn import heatmap

# scikit-learn's signature is confusion_matrix(y_true, y_pred). The original
# call passed the predictions first, which transposed the matrix. With the
# arguments in the documented order, rows are true classes and columns are
# predicted classes.
cm = confusion_matrix(y_val, pred_data)
# fmt='d' prints the cell counts as plain integers instead of the default
# '.2g' format, which switches to scientific notation for counts >= 100.
ax = heatmap(cm, annot=True, fmt='d')
ax.set(xlabel='Predicted price class', ylabel='True price class')

# classification_report expects (y_true, y_pred). The original call passed
# the predictions first, which swaps the reported precision and recall of
# every class and takes the support counts from the predictions.
print(classification_report(y_val, pred_data))

import numpy as np

# One hand-made sample in the feature order used for `x`:
# area code 2, all seven amenity flags set to 1, hotel-class code 3.
sample = [2] + [1] * 7 + [3]
data_pred = np.array([sample])
print(data_pred)
clf.predict(data_pred)