# Tubes_ML - Classification

import copy
from math import sqrt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

# --- Training data: load, clean, encode, and filter outliers ---
# A bare `data_train` line is a notebook-style display of the current frame.
data_train = pd.read_csv(
    'https://github.com/farhanakbar8/classification-car-dealer/blob/main/kendaraan_train.csv?raw=true'
)
data_train

# Remove duplicated rows first, then any row that has a missing value.
data_train = data_train.drop_duplicates()
data_train

data_train = data_train.dropna()
data_train

# Replace the text categories with integer codes, one column at a time.
train_category_codes = {
    'Jenis_Kelamin': {'Pria': 0, 'Wanita': 1},
    'Umur_Kendaraan': {'< 1 Tahun': 0, '1-2 Tahun': 1, '> 2 Tahun': 2},
    'Kendaraan_Rusak': {'Pernah': 1, 'Tidak': 0},
}
for column_name, codes in train_category_codes.items():
    data_train[column_name] = data_train[column_name].map(codes)
data_train

# Drop the 'id' column before plotting and outlier filtering.
data_train = data_train.drop(columns=['id'])
data_train

data_train.boxplot()

# Keep only the rows whose absolute z-score is below the threshold in
# every column.
threshold = 3
z_score = np.abs(stats.zscore(data_train))
rows_within_threshold = (z_score < threshold).all(axis=1)
data_train = data_train[rows_within_threshold]
data_train

# --- Test data: load and encode the categorical columns ---
# A bare `data_test` line is a notebook-style display of the current frame.
data_test = pd.read_csv(
    'https://github.com/farhanakbar8/classification-car-dealer/blob/main/kendaraan_test.csv?raw=true'
)
data_test

# Same text-to-integer codes as used for the training frame.
gender_codes = {'Pria': 0, 'Wanita': 1}
vehicle_age_codes = {'< 1 Tahun': 0, '1-2 Tahun': 1, '> 2 Tahun': 2}
vehicle_damage_codes = {'Pernah': 1, 'Tidak': 0}

test_encodings = (
    ('Jenis_Kelamin', gender_codes),
    ('Umur_Kendaraan', vehicle_age_codes),
    ('Kendaraan_Rusak', vehicle_damage_codes),
)
for encoded_column, code_table in test_encodings:
    data_test[encoded_column] = data_test[encoded_column].map(code_table)
data_test

# --- Feature / label selection (first 10000 rows of each frame) ---
# Bare expression lines are notebook-style displays of the selected frame.
feature_columns = ['Jenis_Kelamin', 'Umur', 'Umur_Kendaraan', 'Premi']
label_columns = ['Tertarik']
row_limit = 10000

data_train_X = data_train[feature_columns][:row_limit]
data_train_X

data_train_Y = data_train[label_columns][:row_limit]
data_train_Y

data_test_X = data_test[feature_columns][:row_limit]
data_test_X

data_test_Y = data_test[label_columns][:row_limit]
data_test_Y

# Heatmap of the pairwise correlations between the training features.
corr_mat = data_train_X.corr()
fig, ax = plt.subplots(figsize=(15, 15))
heatmap_options = dict(
    annot=True,
    ax=ax,
    fmt='.1g',
    cmap='coolwarm',
    linewidths=.5,
    vmin=-0.6,
    vmax=1,
    center=0,
)
sns.heatmap(corr_mat, **heatmap_options)
plt.show()

# --- Convert the selected frames to the array layout the model expects ---
data_train_X_np = data_train_X.to_numpy()
data_test_X_np = data_test_X.to_numpy()

# Labels must be row vectors of shape (1, m).  A single-column frame
# converts to shape (m, 1); combined with the (1, m) activations computed in
# propagate(), that shape broadcasts `A - Y` and `Y * log(A)` to (m, m) and
# corrupts the cost, the gradients and the reported accuracy.
data_train_Y_np = data_train_Y.to_numpy().reshape(1, -1)
data_test_Y_np = data_test_Y.to_numpy().reshape(1, -1)

# Features are laid out as (n_features, m): one example per column.
# NOTE(review): the features reach gradient descent unscaled; consider
# standardizing them with training-set statistics - confirm with the author.
data_train_X_flatten = data_train_X_np.reshape(data_train_X_np.shape[0], -1).T
data_test_X_flatten = data_test_X_np.reshape(data_test_X_np.shape[0], -1).T

def sigmoid(z):
    """Return the logistic sigmoid ``1 / (1 + exp(-z))`` of *z*, element-wise.

    Args:
        z: A scalar or NumPy array of any shape.

    Returns:
        The sigmoid of *z*, in the range [0, 1], with the same shape as *z*.
    """
    # exp(-log(1 + exp(-z))) is algebraically 1 / (1 + exp(-z)).
    # np.logaddexp(0, -z) evaluates log(1 + exp(-z)) without forming exp(-z)
    # on its own, so large negative inputs no longer overflow.
    return np.exp(-np.logaddexp(0, -z))

def initialize_with_zeros(d):
    """Create zero-valued starting parameters for a model with *d* inputs.

    Args:
        d: Number of rows of the weight column vector.

    Returns:
        A tuple ``(w, b)`` where ``w`` is a float array of zeros with shape
        ``(d, 1)`` and ``b`` is the float ``0.0``.
    """
    bias = 0.0
    weights = np.zeros(shape=(d, 1))
    return weights, bias

def propagate(w, b, X, Y):
    """Compute the logistic-regression cost and its gradients.

    Args:
        w: Weights, expected as a column vector of shape (d, 1).
        b: Bias, a scalar.
        X: Feature matrix of shape (d, m), one example per column.
        Y: Binary labels, expected as a row vector of shape (1, m).

    Returns:
        A tuple ``(grads, cost)``.  ``grads`` is a dict with key ``"dw"``
        (gradient of the cost with respect to ``w``) and key ``"db"``
        (gradient with respect to ``b``).  ``cost`` is the mean
        cross-entropy as a 0-d NumPy array.
    """
    m = X.shape[1]

    z = np.dot(w.T, X) + b
    A = sigmoid(z)

    # Cross-entropy written in terms of the logits:
    #   -(y*log(A) + (1-y)*log(1-A)) == log(1 + exp(z)) - y*z
    # Taking log(A) / log(1 - A) directly yields -inf (and nan after the
    # multiplication by 0) as soon as the sigmoid saturates to exactly 0 or 1.
    cost = (1 / m) * np.sum(np.logaddexp(0, z) - Y * z)

    dw = (1 / m) * np.dot(X, (A - Y).T)
    db = (1 / m) * np.sum(A - Y)

    cost = np.squeeze(np.array(cost))
    grads = {"dw": dw, "db": db}
    return grads, cost

def optimize(w, b, X, Y, num_iterations=100, learning_rate=0.009, print_cost=False):
    """Fit ``w`` and ``b`` with batch gradient descent.

    Args:
        w: Initial weights; copied, so the caller's array is not modified.
        b: Initial bias.
        X: Feature matrix passed through to ``propagate``.
        Y: Labels passed through to ``propagate``.
        num_iterations: Number of gradient-descent updates to perform.
        learning_rate: Step size applied to each gradient.
        print_cost: When true, print the cost on every 100th iteration.

    Returns:
        A tuple ``(params, grads, costs)``.  ``params`` holds the final
        ``"w"`` and ``"b"``.  ``grads`` holds the ``"dw"`` and ``"db"`` from
        the last iteration (evaluated before that iteration's update), or
        the gradients at the starting point when no iteration ran.  ``costs``
        lists the cost recorded on every 100th iteration.
    """
    w = copy.deepcopy(w)
    b = copy.deepcopy(b)

    costs = []
    dw = db = None

    for i in range(num_iterations):
        grads, cost = propagate(w, b, X, Y)
        dw, db = grads["dw"], grads["db"]

        # Gradient-descent update rule.
        w = w - learning_rate * dw
        b = b - learning_rate * db

        # Record (and optionally print) the cost every 100 iterations.
        if i % 100 == 0:
            costs.append(cost)
            if print_cost:
                print("Cost after iteration %i: %f" % (i, cost))

    if dw is None:
        # The loop body never ran (num_iterations <= 0).  Evaluate the
        # gradients once at the starting point so the returned dict is still
        # well-defined instead of failing on unbound names.
        grads, _ = propagate(w, b, X, Y)
        dw, db = grads["dw"], grads["db"]

    params = {"w": w, "b": b}
    grads = {"dw": dw, "db": db}
    return params, grads, costs

def predict(w, b, X):
    """Predict a binary label for every column of *X*.

    Args:
        w: Weights, expected as a column vector of shape (d, 1).
        b: Bias, a scalar.
        X: Feature matrix of shape (d, m), one example per column.

    Returns:
        A float array of shape (1, m) holding 1.0 where the predicted
        probability is greater than 0.5 and 0.0 otherwise.
    """
    A = sigmoid(np.dot(w.T, X) + b)

    # Threshold the first row of probabilities in one vectorized comparison
    # rather than looping over the columns in Python.  For w of shape (d, 1)
    # the probabilities have exactly one row.
    return (A[:1, :] > 0.5).astype(float)

def model(X_train, Y_train, X_test, Y_test, num_iterations=2000, learning_rate=0.5, print_cost=False):
    """Train a logistic-regression classifier and predict on both data sets.

    Args:
        X_train: Training features of shape (n_features, m_train), one
            example per column.
        Y_train: Training labels; any array holding m_train values
            (for example shape (1, m_train), (m_train, 1) or (m_train,)).
        X_test: Test features of shape (n_features, m_test).
        Y_test: Test labels; any array holding m_test values.
        num_iterations: Number of gradient-descent iterations.
        learning_rate: Learning rate used by the update rule in optimize().
        print_cost: When true, print the cost every 100 iterations and the
            final train/test accuracy.

    Returns:
        A dict with the keys "costs", "Y_prediction_test",
        "Y_prediction_train", "w", "b", "learning_rate" and
        "num_iterations".
    """
    # Normalize the labels to row vectors of shape (1, m).  The predictions
    # have that shape, and a column vector would broadcast against them to an
    # (m, m) array, corrupting both the training and the accuracy figures.
    Y_train = np.asarray(Y_train).reshape(1, -1)
    Y_test = np.asarray(Y_test).reshape(1, -1)

    # Start from all-zero parameters, one weight per feature row.
    w, b = initialize_with_zeros(X_train.shape[0])

    # Gradient descent; the returned gradients are not needed here.
    params, _, costs = optimize(w, b, X_train, Y_train, num_iterations, learning_rate, print_cost)
    w = params["w"]
    b = params["b"]

    Y_prediction_test = predict(w, b, X_test)
    Y_prediction_train = predict(w, b, X_train)

    # Accuracy is 100 minus the mean absolute prediction error, in percent.
    if print_cost:
        print("train accuracy: {} %".format(100 - np.mean(np.abs(Y_prediction_train - Y_train)) * 100))
        print("test accuracy: {} %".format(100 - np.mean(np.abs(Y_prediction_test - Y_test)) * 100))

    d = {
        "costs": costs,
        "Y_prediction_test": Y_prediction_test,
        "Y_prediction_train": Y_prediction_train,
        "w": w,
        "b": b,
        "learning_rate": learning_rate,
        "num_iterations": num_iterations,
    }
    return d

# --- Train the classifier and show the resulting dictionary ---
training_options = {
    "num_iterations": 2000,
    "learning_rate": 0.005,
    "print_cost": True,
}
logistic_regression_model = model(
    data_train_X_flatten,
    data_train_Y_np,
    data_test_X_flatten,
    data_test_Y_np,
    **training_options,
)

# Notebook-style display of the returned dictionary.
logistic_regression_model