import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, precision_score, precision_recall_curve, PrecisionRecallDisplay
# Load the semicolon-delimited CSV exports and assemble the working table:
# transactions joined with the training labels on client_id; rows without a
# label (no match in train_set) are dropped.
transactions = pd.read_csv("transactions.csv", sep=";")
codes = pd.read_csv("codes.csv", sep=";")
types = pd.read_csv("types.csv", sep=";")
train = pd.read_csv("train_set.csv", sep=";")
data = (
    transactions.set_index("client_id")
    .join(train.set_index("client_id"))
    .reset_index()
    .dropna()
)
data.head()
client_idint64
datetimeobject
0
22899
231 09:31:53
1
22899
349 16:34:52
2
22899
441 15:36:37
3
22899
167 09:50:12
4
22899
221 18:54:19
# Structural overview: (rows, columns), then per-column dtype / non-null counts.
data.shape
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 91826 entries, 0 to 130023
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 client_id 91826 non-null int64
1 datetime 91826 non-null object
2 code 91826 non-null int64
3 type 91826 non-null int64
4 sum 91826 non-null float64
5 target 91826 non-null float64
dtypes: float64(2), int64(3), object(1)
memory usage: 4.9+ MB
# Missing-value audit. The frame was already built with .dropna(), so all
# counts are expected to be zero and dropna() here should change nothing.
data.isnull().sum()
data.dropna().describe()
client_idfloat64
codefloat64
count
91826
91826
mean
51693681.87
5596.441389
std
28399447.09
602.1448136
min
22899
742
25%
27360511
5251
50%
53883700
5661
75%
75521890
6010
max
99991245
9402
# Drop any row with a missing value, in place.
# NOTE(review): data was constructed with .dropna() already, so this is
# expected to be a no-op — confirm (shape/describe below match the earlier ones).
data.dropna(how="any", inplace=True)
data.shape
data.describe()
client_idfloat64
codefloat64
count
91826
91826
mean
51693681.87
5596.441389
std
28399447.09
602.1448136
min
22899
742
25%
27360511
5251
50%
53883700
5661
75%
75521890
6010
max
99991245
9402
# Numeric-only view of the dataset (excludes the object-typed datetime column).
numeric_dtypes = ["float64", "int64"]
data_num = data.select_dtypes(include=numeric_dtypes)
data_num.head()
client_idint64
codeint64
0
22899
6011
1
22899
6011
2
22899
6011
3
22899
4814
4
22899
5399
# Show the most frequent values of every column: top 3 for categorical-like
# columns (rendered as strings), top 5 for everything else.
from pandas.api.types import is_categorical_dtype

for col in data.columns:
    print(col, end=' - \n')
    print('_' * 50)
    # BUG FIX: the original passed the column *name* (a str) to
    # is_categorical_dtype, which is never categorical, so the first branch
    # only ever fired on the hard-coded name. Test the column itself.
    # NOTE(review): is_categorical_dtype is deprecated in pandas >= 2.1;
    # isinstance(data[col].dtype, pd.CategoricalDtype) is the replacement.
    if col in ['Type description'] or is_categorical_dtype(data[col]):
        display(pd.DataFrame(data[col].astype('str').value_counts().sort_values(ascending=False).head(3)))
    else:
        display(pd.DataFrame(data[col].value_counts().sort_values(ascending=False).head(5)))
client_id -
__________________________________________________
datetime -
__________________________________________________
code -
__________________________________________________
type -
__________________________________________________
sum -
__________________________________________________
target -
__________________________________________________
targetint64
0
46715
1
45111
# Histograms of every numeric column.
data_num.hist(figsize=(15, 15), bins=40);
# Correlation heatmap. numeric_only=True is required on pandas >= 2.0, where
# DataFrame.corr raises on the object-typed 'datetime' column instead of
# silently skipping it.
corr = data.corr(numeric_only=True)
g = sns.heatmap(corr, vmax=.3, center=0,
square=True, linewidths=.5, cbar_kws={"shrink": .5}, annot=True, fmt='.2f', cmap='coolwarm')
sns.despine()
g.figure.set_size_inches(14, 10)
plt.show()
# Rows with a negative transaction amount.
# NOTE(review): presumably these are debits/outgoing money — confirm with the
# data dictionary.
negative_mask = data['sum'] < 0
z = data[negative_mask]
print(z)
client_id datetime code type sum target
0 22899 231 09:31:53 6011 2110 -6737.75 1.0
1 22899 349 16:34:52 6011 4010 -8759.07 1.0
2 22899 441 15:36:37 6011 4010 -8759.07 1.0
3 22899 167 09:50:12 4814 1030 -1122.96 1.0
4 22899 221 18:54:19 5399 1110 -4626.59 1.0
... ... ... ... ... ... ...
129994 99911226 259 15:16:17 5411 1010 -26629.82 0.0
130014 99967537 332 09:55:60 6011 2010 -336887.37 1.0
130019 99985917 61 11:42:26 6011 2010 -224591.58 0.0
130020 99991245 375 06:38:58 4829 2330 -67377.47 1.0
130021 99991245 292 11:20:02 5499 1010 -16574.86 1.0
[73733 rows x 6 columns]
# Count of negative-amount rows, then the positive-amount subset.
z.shape[0]
positive_mask = data['sum'] > 0
y = data[positive_mask]
print(y)
client_id datetime code type sum target
6 22899 352 10:45:57 6010 7071 17967.33 1.0
7 22899 61 20:53:04 6010 7030 22459.16 1.0
8 22899 344 11:45:23 6011 7010 44918.32 1.0
9 27914 292 10:50:34 6011 7010 11229.58 1.0
11 27914 141 13:20:55 6010 7020 67377.47 1.0
... ... ... ... ... ... ...
129946 99849205 409 14:44:35 6011 7010 24705.07 1.0
129971 99882949 393 08:59:16 6011 7010 224591.58 0.0
129973 99882949 119 07:09:44 6010 7070 73351.61 0.0
130022 99991245 306 06:18:20 6011 7010 561478.94 1.0
130023 99991245 306 06:20:43 6011 7010 92082.55 1.0
[18093 rows x 6 columns]
# Pie chart of negative vs. positive transaction-row counts.
y.shape[0]
fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
ax.axis('equal')
# The original built the identical list twice (sum_2 was never used) — keep one.
count_sum = [z.shape[0], y.shape[0]]
ax.pie(count_sum, autopct='%1.2f%%')
plt.show()
# Split 'datetime' ("<day> <HH:MM:SS>") into numeric day/time components.
# The positional n/maxsplit argument to Series.str.split was deprecated and
# then removed in pandas 2.0; the original passed n=0, which meant "no limit"
# — the same as the default — so it is simply dropped here.
data[["day", "time"]] = data["datetime"].str.split(' ', expand=True)
data[["hours", "minute", "seconds"]] = data["time"].str.split(':', expand=True)
data.day = data.day.astype(int)
data.hours = data.hours.astype(int)
data.minute = data.minute.astype(int)
data.seconds = data.seconds.astype(int)
data.target = data.target.astype(int)
# 'day' looks like an anonymized day offset; day % 7 gives a pseudo-weekday
# bucket (alignment with real weekdays unknown — confirm).
data["weekday"] = data.day % 7
data = data.drop(columns=["datetime", "time"])
print(data.dtypes)
data.head()
client_id int64
code int64
type int64
sum float64
target int64
day int64
hours int64
minute int64
seconds int64
weekday int64
dtype: object
client_idint64
codeint64
0
22899
6011
1
22899
6011
2
22899
6011
3
22899
4814
4
22899
5399
# NOTE(review): this plots the top-100 'day' value counts against the top-100
# 'code' value counts. The two Series have unrelated indices, and if 'code'
# has fewer than 100 distinct values the lengths differ and plt.plot raises —
# confirm what this chart is meant to show.
x = data['day'].value_counts()[:100]
# NOTE(review): rebinds y, which previously held the positive-sum subset.
y = data['code'].value_counts()[:100]
plt.plot(x,y)
plt.show()
# Distribution plots for the engineered time features.
sns.displot(data['day'])
plt.title('Count of Days')
sns.displot(data['weekday'])
plt.title('Count of Week Days')
sns.displot(data['hours'])
plt.title('Count of Hours')
# Hold-out split: features = all columns except the label; default 75/25 split.
from sklearn.model_selection import train_test_split
X = data.drop(columns=["target"])
y = data["target"]
# NOTE(review): no random_state or stratify — every run yields a different
# split, so the metric numbers below are not reproducible; consider fixing both.
X_train, X_test, y_train, y_test = train_test_split(X, y)
# Baseline 10-fold cross-validation for three classifier families.
# The original discarded the returned score arrays (they only rendered as
# notebook cell output); print them so the comparison survives as a script.
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=0)
print("DecisionTree CV:", cross_val_score(clf, X, y, cv=10))
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors=3)
print("KNN CV:", cross_val_score(clf, X, y, cv=10))
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
print("RandomForest CV:", cross_val_score(clf, X, y, cv=10))
# Randomized hyperparameter search for a decision tree, then test-set evaluation.
param_dist = {
    "criterion": ["gini", "entropy"],
    "splitter": ["best", "random"],
    "max_depth": [1, 2, 3]
}
random_search = RandomizedSearchCV(DecisionTreeClassifier(), param_distributions=param_dist, n_iter=10)
random_search.fit(X_train, y_train)
random_search.best_params_
# Use the estimator the search actually selected (refit on the full training
# set by default) instead of re-hard-coding one past run's best_params_,
# which can disagree with a fresh run.
clf = random_search.best_estimator_
y_pred = clf.predict(X_test)
# NOTE(review): ROC AUC on hard 0/1 predictions degenerates to balanced
# accuracy; predict_proba scores would give the real AUC — consider switching.
print("ROC/AUC:", roc_auc_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
precision, recall, _ = precision_recall_curve(y_test, y_pred)
PrecisionRecallDisplay(precision=precision, recall=recall).plot()
ROC/AUC: 0.5333562679535987
Precision: 0.556589409575368
# Randomized hyperparameter search for k-nearest-neighbors, then evaluation.
param_dist = {
    "n_neighbors": [1,2,3,4,5,6,7,8,9,10],
    "weights": ["uniform", "distance"],
    "algorithm": ["auto", "ball_tree", "kd_tree", "brute"]
}
random_search = RandomizedSearchCV(KNeighborsClassifier(), param_distributions=param_dist, n_iter=10)
random_search.fit(X_train, y_train)
random_search.best_params_
# Use the refit winner of *this* search rather than hard-coding a past run's
# parameters, which may not match what the search just found.
clf = random_search.best_estimator_
y_pred = clf.predict(X_test)
print("ROC/AUC:", roc_auc_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
precision, recall, _ = precision_recall_curve(y_test, y_pred)
PrecisionRecallDisplay(precision=precision, recall=recall).plot()
ROC/AUC: 0.9049816169512355
Precision: 0.9028834247302318
# Randomized hyperparameter search for a random forest, then evaluation.
param_dist = {
    "n_estimators": [100, 200, 300, 400],
    "criterion": ["gini", "entropy"]
}
# NOTE(review): n_iter=1 samples a single random combination, so this is not
# really a search — raise n_iter (8 combos exist) if runtime permits.
random_search = RandomizedSearchCV(RandomForestClassifier(), param_distributions=param_dist, n_iter=1)
random_search.fit(X_train, y_train)
random_search.best_params_
# Evaluate the estimator the search refit, not a separately hard-coded model.
clf = random_search.best_estimator_
y_pred = clf.predict(X_test)
print("ROC/AUC:", roc_auc_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
precision, recall, _ = precision_recall_curve(y_test, y_pred)
PrecisionRecallDisplay(precision=precision, recall=recall).plot()
ROC/AUC: 0.6582185657139086
Precision: 0.6746199536897212
# Exhaustive grid search for a decision tree, then test-set evaluation.
param_dist = {
    "criterion": ["gini", "entropy"],
    "splitter": ["best", "random"],
    "max_depth": [1,2,3]
}
grid_search = GridSearchCV(DecisionTreeClassifier(), param_dist)
grid_search.fit(X_train, y_train)
grid_search.best_params_
# Use the refit best estimator from this search instead of hard-coding a past
# run's parameters.
clf = grid_search.best_estimator_
y_pred = clf.predict(X_test)
print("ROC/AUC:", roc_auc_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
precision, recall, _ = precision_recall_curve(y_test, y_pred)
PrecisionRecallDisplay(precision=precision, recall=recall).plot()
ROC/AUC: 0.5333562679535987
Precision: 0.556589409575368
# Exhaustive grid search for k-nearest-neighbors, then evaluation.
param_dist = {
    "n_neighbors": [1,2,3,4],
    "weights": ["uniform", "distance"],
    "algorithm": ["auto", "ball_tree", "kd_tree", "brute"]
}
grid_search = GridSearchCV(KNeighborsClassifier(), param_dist)
grid_search.fit(X_train, y_train)
grid_search.best_params_
# Use the refit best estimator from this search instead of hard-coding a past
# run's parameters.
clf = grid_search.best_estimator_
y_pred = clf.predict(X_test)
print("ROC/AUC:", roc_auc_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
precision, recall, _ = precision_recall_curve(y_test, y_pred)
PrecisionRecallDisplay(precision=precision, recall=recall).plot()
ROC/AUC: 0.9049816169512355
Precision: 0.9028834247302318
# Exhaustive grid search for a random forest, then evaluation.
param_dist = {
    "n_estimators": [100, 200, 300, 400],
    "criterion": ["gini", "entropy"]
}
grid_search = GridSearchCV(RandomForestClassifier(), param_dist)
grid_search.fit(X_train, y_train)
grid_search.best_params_
# Use the refit best estimator from this search instead of hard-coding a past
# run's parameters.
clf = grid_search.best_estimator_
y_pred = clf.predict(X_test)
print("ROC/AUC:", roc_auc_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
precision, recall, _ = precision_recall_curve(y_test, y_pred)
PrecisionRecallDisplay(precision=precision, recall=recall).plot()
ROC/AUC: 0.6583170960857924
Precision: 0.674439642175093