import pandas as pd
# Read the raw transaction records from the CSV file and echo the whole table.
data = pd.read_csv(filepath_or_buffer='transaction_data.csv')
print(data)
Email Domain Country Card Type Defaulted
0 gmail USA VISA 0
1 gmail UK MC 0
2 yahoo USA AMEX 1
3 gmail USA Discover 0
4 gmail USA MC 0
5 yahoo India MC 0
6 other Russia VISA 1
7 gmail USA MC 0
8 other India MC 0
9 yahoo Russia VISA 1
10 gmail USA MC 0
11 gmail India VISA 1
12 gmail UK MC 0
13 yahoo USA Discover 0
14 gmail USA VISA 0
15 yahoo USA MC 0
16 other USA AMEX 1
17 gmail USA MC 0
18 gmail UK VISA 0
19 other USA MC 0
# Split the table into the three categorical predictors and the label column.
raw_features = data[['Email Domain', 'Country', 'Card Type']]
target = data['Defaulted']

from sklearn.preprocessing import OneHotEncoder

# One-hot encode every categorical column. The encoder is left at its default
# (sparse) output and densified explicitly: the `sparse=` keyword was renamed
# to `sparse_output=` in scikit-learn 1.2 and later removed, so avoiding the
# keyword altogether keeps this cell working on both old and new releases.
encoder = OneHotEncoder()
encoded = encoder.fit_transform(raw_features).toarray()

# `get_feature_names` was replaced by `get_feature_names_out` (added in
# scikit-learn 1.0, the old name removed in 1.2). Prefer the new method and
# fall back to the old one only when running on an older release.
input_names = list(raw_features.columns)
if hasattr(encoder, 'get_feature_names_out'):
    column_names = encoder.get_feature_names_out(input_names)
else:
    column_names = encoder.get_feature_names(input_names)

# Column labels take the form "<original column>_<category>".
features = pd.DataFrame(encoded, columns=column_names)
print(features.head())
Email Domain_gmail Email Domain_other Email Domain_yahoo Country_India \
0 1.0 0.0 0.0 0.0
1 1.0 0.0 0.0 0.0
2 0.0 0.0 1.0 0.0
3 1.0 0.0 0.0 0.0
4 1.0 0.0 0.0 0.0
Country_Russia Country_UK Country_USA Card Type_AMEX \
0 0.0 0.0 1.0 0.0
1 0.0 1.0 0.0 0.0
2 0.0 0.0 1.0 1.0
3 0.0 0.0 1.0 0.0
4 0.0 0.0 1.0 0.0
Card Type_Discover Card Type_MC Card Type_VISA
0 0.0 0.0 1.0
1 0.0 1.0 0.0
2 0.0 0.0 0.0
3 1.0 0.0 0.0
4 0.0 1.0 0.0
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.metrics import accuracy_score, confusion_matrix

# Fit a tree with default settings and score it on the same rows it was
# trained on, so the figures printed below are training-set metrics.
model = DecisionTreeClassifier().fit(features, target)
predictions = model.predict(features)

print("Accuracy: ", accuracy_score(target, predictions))
print("--------", "Confusion Matrix", sep="\n")
# Rows are the true labels, columns the predicted labels.
print(pd.DataFrame(confusion_matrix(target, predictions)))
Accuracy: 1.0
--------
Confusion Matrix
0 1
0 15 0
1 0 5
# Write the first fitted tree out in Graphviz DOT format.
export_graphviz(
    model,
    out_file='transaction_tree_1.dot',
    # Passed as a plain list of the encoded column names.
    feature_names=list(features.columns),
    class_names=['Paid', 'Defaulted'],
    rounded=True,
    filled=True,
)

# Must have graphviz installed for this
from subprocess import check_call
import os

# Render the DOT file to PNG, then delete the intermediate DOT file.
check_call(['dot', '-Tpng', 'transaction_tree_1.dot', '-o', 'transaction_tree_1.png'])
os.remove('transaction_tree_1.dot')

from IPython.display import display, Image

# The file name must match the '-o' target of the `dot` call above;
# 'transaction_tree.png' (without the "_1") is never created by this script.
# display() renders the image regardless of where this line sits in a cell.
display(Image(filename='transaction_tree_1.png'))
# Second tree: every leaf must hold at least two training rows. As with the
# first model, it is scored on the rows it was trained on.
model2 = DecisionTreeClassifier(min_samples_leaf=2).fit(features, target)
predictions = model2.predict(features)

print("Accuracy: ", accuracy_score(target, predictions))
print("--------", "Confusion Matrix", sep="\n")
# Rows are the true labels, columns the predicted labels.
print(pd.DataFrame(confusion_matrix(target, predictions)))
Accuracy: 0.95
--------
Confusion Matrix
0 1
0 15 0
1 1 4
from subprocess import check_call
import os

# Dump the second tree as a Graphviz DOT description.
export_graphviz(
    model2,
    out_file='transaction_tree_2.dot',
    filled=True,
    rounded=True,
    class_names=['Paid', 'Defaulted'],
    feature_names=features.columns,
)

# Must have graphviz installed for this
# Convert the DOT file to a PNG, then discard the intermediate DOT file.
check_call(['dot', '-Tpng', 'transaction_tree_2.dot', '-o', 'transaction_tree_2.png'])
os.remove('transaction_tree_2.dot')

from IPython.display import display, Image

# Show the rendered tree inline.
display(Image(filename='transaction_tree_2.png'))
# Print each one-hot column next to the importance model2 assigned to it.
for name, score in zip(features.columns, model2.feature_importances_):
    print(name, score)
Email Domain_gmail 0.0
Email Domain_other 0.0
Email Domain_yahoo 0.0
Country_India 0.08783783783783784
Country_Russia 0.4054054054054054
Country_UK 0.0
Country_USA 0.0
Card Type_AMEX 0.5067567567567567
Card Type_Discover 0.0
Card Type_MC 0.0
Card Type_VISA 0.0
# A hypothetical customer: gmail address, based in the USA, paying with AMEX.
# The keys are the exact one-hot column names the model was trained on, so the
# vector can be assembled by name rather than by position in this literal.
test_customer = {
    'Email Domain_gmail': 1,
    'Email Domain_other': 0,
    'Email Domain_yahoo': 0,
    'Country_India': 0,
    'Country_Russia': 0,
    'Country_UK': 0,
    'Country_USA': 1,
    'Card Type_AMEX': 1,
    'Card Type_Discover': 0,
    'Card Type_MC': 0,
    'Card Type_VISA': 0,
}

# Order the values by the training columns. A missing or misspelled key raises
# KeyError here instead of silently shifting values onto the wrong feature.
test_customer_values = [test_customer[column] for column in features.columns]

# Predict from a single-row frame that carries the training column names, so
# the model receives input labelled the same way as the data it was fitted on.
test_row = pd.DataFrame([test_customer_values], columns=features.columns)
test_prediction = model2.predict(test_row)
print("Test Prediction: ", test_prediction[0])
Test Prediction: 1