# Import pandas
import pandas as pd
# Load dataset
cc_apps = pd.read_csv("/work/datasets/cc_approvals.data", header=None)
# Inspect data
print(cc_apps.head())
# Print summary statistics
cc_apps_description = cc_apps.describe()
print(cc_apps_description)
print('\n')
# Print DataFrame information
# info() prints its summary directly and returns None, so no assignment is needed
cc_apps.info()
print('\n')
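# Note: describe() above summarizes only the numeric columns; a dtype
# breakdown confirms most features were parsed as strings (object dtype)
print(cc_apps.dtypes.value_counts())
print('\n')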
# Inspect missing values (marked '?') in the tail of the dataset
print(cc_apps.tail(17))
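# A quick sketch to quantify the problem: this dataset marks missing entries
# with the string '?', so counting those per column shows where the gaps are
print((cc_apps == '?').sum())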
# Import train_test_split
from sklearn.model_selection import train_test_split
# Drop the features 11 and 13
cc_apps = cc_apps.drop([11, 13], axis=1)
# Split into train and test sets
cc_apps_train, cc_apps_test = train_test_split(cc_apps, test_size=0.33, random_state=42)
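# Sanity-check the split sizes; with test_size=0.33 the test set should hold
# roughly a third of the rows
print(cc_apps_train.shape, cc_apps_test.shape)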
# Import numpy
import numpy as np
# Replace the '?'s with NaN in the train and test sets
cc_apps_train = cc_apps_train.replace("?", np.nan)
cc_apps_test = cc_apps_test.replace("?", np.nan)
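# Verify that every '?' placeholder is now a proper NaN in both splits
print(cc_apps_train.isna().sum().sum(), cc_apps_test.isna().sum().sum())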
# Impute the missing numeric values with mean imputation, using the
# training-set means for both splits so no test-set information leaks in
cc_apps_train.fillna(cc_apps_train.mean(numeric_only=True), inplace=True)
cc_apps_test.fillna(cc_apps_train.mean(numeric_only=True), inplace=True)
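# An equivalent sketch using scikit-learn's SimpleImputer: fitting on the
# numeric train columns and reusing the fitted imputer on the test set makes
# the no-leakage rule explicit (a no-op here, since fillna already ran)
from sklearn.impute import SimpleImputer
num_cols = cc_apps_train.select_dtypes(include='number').columns
mean_imputer = SimpleImputer(strategy='mean')
cc_apps_train[num_cols] = mean_imputer.fit_transform(cc_apps_train[num_cols])
cc_apps_test[num_cols] = mean_imputer.transform(cc_apps_test[num_cols])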
# Count the number of NaNs left in the train and test sets and print the counts to verify
print(cc_apps_train.isna().sum())
print(cc_apps_test.isna().sum())
# Iterate over each column of cc_apps_train
for col in cc_apps_train.columns:
    # Check if the column is of object type
    if cc_apps_train[col].dtype == 'object':
        # Impute that column with the most frequent value from the train set,
        # applying the same value to the test set
        most_frequent = cc_apps_train[col].value_counts().index[0]
        cc_apps_train[col] = cc_apps_train[col].fillna(most_frequent)
        cc_apps_test[col] = cc_apps_test[col].fillna(most_frequent)
# Count the number of NaNs in the dataset and print the counts to verify
print(cc_apps_train.isnull().sum())
print(cc_apps_test.isnull().sum())
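# The same mode imputation expressed with SimpleImputer, again fit on the
# train split only; strategy='most_frequent' mirrors the loop above and is
# a no-op at this point because the object columns are already filled
from sklearn.impute import SimpleImputer
obj_cols = cc_apps_train.select_dtypes(include='object').columns
mode_imputer = SimpleImputer(strategy='most_frequent')
cc_apps_train[obj_cols] = mode_imputer.fit_transform(cc_apps_train[obj_cols])
cc_apps_test[obj_cols] = mode_imputer.transform(cc_apps_test[obj_cols])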
# Convert the categorical features in the train and test sets independently.
# drop_first=True keeps k-1 dummies per category; crucially, the label column
# then becomes a single 0/1 column instead of two complementary dummies, one
# of which would otherwise leak into the feature matrix below
cc_apps_train = pd.get_dummies(cc_apps_train, drop_first=True)
cc_apps_test = pd.get_dummies(cc_apps_test, drop_first=True)
# Reindex the columns of the test set aligning with the train set
cc_apps_test = cc_apps_test.reindex(columns=cc_apps_train.columns, fill_value=0)
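# After reindexing, both splits expose the same dummy columns in the same
# order: train-only categories are zero-filled in the test set, and
# test-only categories are dropped
assert list(cc_apps_train.columns) == list(cc_apps_test.columns)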
# Import MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
# Segregate features and labels into separate variables
X_train, y_train = cc_apps_train.iloc[:, :-1].values, cc_apps_train.iloc[:, -1].values
X_test, y_test = cc_apps_test.iloc[:, :-1].values, cc_apps_test.iloc[:, -1].values
# Instantiate MinMaxScaler and use it to rescale X_train and X_test
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX_train = scaler.fit_transform(X_train)
rescaledX_test = scaler.transform(X_test)
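# MinMaxScaler learns each feature's min/max from the train set, so the
# training matrix lands exactly in [0, 1]; test values can fall slightly
# outside that range, which is expected
print(rescaledX_train.min(), rescaledX_train.max())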
# Import LogisticRegression
from sklearn.linear_model import LogisticRegression
# Instantiate a LogisticRegression classifier with default parameter values
logreg = LogisticRegression()
# Fit logreg to the train set
logreg.fit(rescaledX_train, y_train)
# Import confusion_matrix
from sklearn.metrics import confusion_matrix
# Use logreg to predict instances from the test set and store it
y_pred = logreg.predict(rescaledX_test)
# Get the accuracy score of logreg model and print it
print("Accuracy of logistic regression classifier: ",logreg.score(rescaledX_test,y_test))
# Print the confusion matrix of the logreg model
print(confusion_matrix(y_test, y_pred))
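# Accuracy alone can hide class imbalance; classification_report adds
# per-class precision, recall, and F1 for the approved/denied classes
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))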
# Import GridSearchCV
from sklearn.model_selection import GridSearchCV
# Define the grid of values for tol and max_iter
tol = [0.01, 0.001, 0.0001]
max_iter = [100, 150, 200]
# Create a dictionary where tol and max_iter are keys and the lists of their values are corresponding values
param_grid = dict(tol=tol, max_iter=max_iter)
# Instantiate GridSearchCV with the required parameters
grid_model = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=5)
# Fit grid_model to the data
grid_model_result = grid_model.fit(rescaledX_train, y_train)
# Summarize results
best_score, best_params = grid_model_result.best_score_, grid_model_result.best_params_
print("Best: %f using %s" % (best_score, best_params))
# Extract the best model and evaluate it on the test set
best_model = grid_model_result.best_estimator_
print("Accuracy of logistic regression classifier: ",best_model.score(rescaledX_test,y_test))