import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, precision_score, precision_recall_curve, PrecisionRecallDisplay
# ---- Load data and run a first-pass EDA ----
transactions = pd.read_csv('transactions.csv', sep=";")
codes = pd.read_csv('codes.csv', sep=";")
types = pd.read_csv('types.csv', sep=";")
train = pd.read_csv('train_set.csv', sep=";")

# Attach the training labels to the transactions via client_id and drop
# rows without a label (clients that are not in the train set).
data = (
    transactions.set_index("client_id")
    .join(train.set_index("client_id"))
    .reset_index()
    .dropna()
)

# Notebook-style inspection of the joined dataset.
data.head()
data.shape
data.info()
data.isnull().sum()
data.describe()

# NOTE(review): NaN rows were already dropped during the join above, so
# this is a no-op kept as a safety net.
data.dropna(how="any", inplace=True)
data.shape

# Numeric columns only — reused below for histograms and correlations.
data_num = data.select_dtypes(include=['float64', 'int64'])
data_num.head()
# ---- Show the most frequent values of every column ----
for col in data.columns:
    print(col, end=' - \n')
    print('_' * 50)
    # BUG FIX: the original called is_categorical_dtype(col) with the
    # column *name* (a plain str), which is always False. Test the
    # column's dtype instead; isinstance(..., pd.CategoricalDtype)
    # replaces the deprecated pandas.api.types.is_categorical_dtype.
    if col in ['Type description'] or isinstance(data[col].dtype, pd.CategoricalDtype):
        # Categorical/free-text columns: top 3 values as strings.
        display(pd.DataFrame(data[col].astype('str').value_counts().sort_values(ascending=False).head(3)))
    else:
        # Everything else: top 5 raw values.
        # NOTE(review): `display` is the IPython/Jupyter builtin — this
        # block assumes it runs in a notebook.
        display(pd.DataFrame(data[col].value_counts().sort_values(ascending=False).head(5)))
# ---- Distributions of numeric features and their correlations ----
data_num.hist(figsize=(15, 15), bins=40);

# BUG FIX: DataFrame.corr() on a frame with non-numeric columns raises
# in pandas >= 2.0; compute the correlation on the numeric subset
# (identical result on older pandas, which silently dropped them).
corr = data_num.corr()
g = sns.heatmap(corr, vmax=.3, center=0,
                square=True, linewidths=.5, cbar_kws={"shrink": .5},
                annot=True, fmt='.2f', cmap='coolwarm')
sns.despine()
g.figure.set_size_inches(14, 10)
plt.show()
# ---- Share of negative vs positive transaction sums ----
negative = data[data['sum'] < 0]   # debits / outgoing amounts
print(negative)
negative.shape[0]
positive = data[data['sum'] > 0]   # credits / incoming amounts
print(positive)
positive.shape[0]

fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
ax.axis('equal')
# FIX: removed the unused duplicate list (sum_2) and labelled the
# wedges so the chart is readable without the surrounding code.
count_sum = [negative.shape[0], positive.shape[0]]
ax.pie(count_sum, labels=['sum < 0', 'sum > 0'], autopct='%1.2f%%')
plt.show()
# ---- Split 'datetime' into day / time-of-day features ----
# FIX: passing `n` positionally to Series.str.split was deprecated in
# pandas 1.4 and removed in 2.0; pass it by keyword. n=1 / n=2 produce
# exactly the 2 and 3 columns assigned below.
data[["day", "time"]] = data["datetime"].str.split(' ', n=1, expand=True)
data[["hours", "minute", "seconds"]] = data["time"].str.split(':', n=2, expand=True)

# Cast the extracted string parts (and the label) to integers.
data.day = data.day.astype(int)
data.hours = data.hours.astype(int)
data.minute = data.minute.astype(int)
data.seconds = data.seconds.astype(int)
data.target = data.target.astype(int)

# 'day' appears to be an integer day offset, so day % 7 yields a weekday
# index (assumes day 0 falls on a fixed weekday — TODO confirm with the
# dataset description).
data["weekday"] = data.day % 7

# The raw strings are no longer needed once the numeric parts exist.
data = data.drop(columns=["datetime", "time"])
print(data.dtypes)
data.head()
# ---- Frequency plots for day / code / weekday / hour ----
# BUG FIX: the original did plt.plot(x, y) with x = day value-counts and
# y = code value-counts — two series with unrelated indexes and
# potentially different lengths, which errors or plots nonsense.
# Plot each distribution over its own index instead.
day_counts = data['day'].value_counts().sort_index()[:100]
plt.plot(day_counts.index, day_counts.values)
plt.title('Transactions per Day')
plt.show()

code_counts = data['code'].value_counts()[:100]
plt.plot(code_counts.index, code_counts.values)
plt.title('Transactions per Code (top 100)')
plt.show()

sns.displot(data['day'])
plt.title('Count of Days')
sns.displot(data['weekday'])
plt.title('Count of Week Days')
sns.displot(data['hours'])
plt.title('Count of Hours')
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# ---- Hold-out split ----
X = data.drop(columns=["target"])
y = data["target"]
# FIX: fixed random_state makes the split (and every result below)
# reproducible; stratify keeps the class balance equal in both halves.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, stratify=y)

# ---- Baseline 10-fold cross-validation for three classifiers ----
# FIX: the original evaluated cross_val_score and discarded the result
# (a no-op outside a notebook); print the fold scores explicitly.
clf = DecisionTreeClassifier(random_state=0)
print("DecisionTree CV:", cross_val_score(clf, X, y, cv=10))

clf = KNeighborsClassifier(n_neighbors=3)
print("KNN CV:", cross_val_score(clf, X, y, cv=10))

clf = RandomForestClassifier()
print("RandomForest CV:", cross_val_score(clf, X, y, cv=10))
# ---- Decision tree: randomized hyper-parameter search ----
param_dist = {
    "criterion": ["gini", "entropy"],
    "splitter": ["best", "random"],
    "max_depth": [1, 2, 3],
}
random_search = RandomizedSearchCV(DecisionTreeClassifier(),
                                   param_distributions=param_dist, n_iter=10)
random_search.fit(X_train, y_train)
print(random_search.best_params_)

# FIX: use the estimator the search actually selected instead of
# re-hard-coding parameters that may disagree with best_params_.
clf = random_search.best_estimator_
y_pred = clf.predict(X_test)
# FIX: ROC/AUC must be computed from class probabilities/scores, not
# from hard 0/1 predictions; same for the precision-recall curve.
y_score = clf.predict_proba(X_test)[:, 1]
print("ROC/AUC:", roc_auc_score(y_test, y_score))
print("Precision:", precision_score(y_test, y_pred))
precision, recall, _ = precision_recall_curve(y_test, y_score)
PrecisionRecallDisplay(precision=precision, recall=recall).plot()
# ---- KNN: randomized hyper-parameter search ----
param_dist = {
    "n_neighbors": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    "weights": ["uniform", "distance"],
    "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
}
random_search = RandomizedSearchCV(KNeighborsClassifier(),
                                   param_distributions=param_dist, n_iter=10)
random_search.fit(X_train, y_train)
print(random_search.best_params_)

# FIX: evaluate the estimator the search selected (refit on the full
# training set by default) instead of hard-coded parameters.
clf = random_search.best_estimator_
y_pred = clf.predict(X_test)
# FIX: score-based metrics use probabilities, not hard predictions.
y_score = clf.predict_proba(X_test)[:, 1]
print("ROC/AUC:", roc_auc_score(y_test, y_score))
print("Precision:", precision_score(y_test, y_pred))
precision, recall, _ = precision_recall_curve(y_test, y_score)
PrecisionRecallDisplay(precision=precision, recall=recall).plot()
# ---- Random forest: randomized hyper-parameter search ----
param_dist = {
    "n_estimators": [100, 200, 300, 400],
    "criterion": ["gini", "entropy"],
}
# FIX: the original used n_iter=1, i.e. a single random sample from the
# 8-combination space — effectively no search. Cover the whole space.
random_search = RandomizedSearchCV(RandomForestClassifier(),
                                   param_distributions=param_dist, n_iter=8)
random_search.fit(X_train, y_train)
print(random_search.best_params_)

# FIX: evaluate the estimator the search selected instead of
# re-hard-coding parameters that may disagree with best_params_.
clf = random_search.best_estimator_
y_pred = clf.predict(X_test)
# FIX: score-based metrics use probabilities, not hard predictions.
y_score = clf.predict_proba(X_test)[:, 1]
print("ROC/AUC:", roc_auc_score(y_test, y_score))
print("Precision:", precision_score(y_test, y_pred))
precision, recall, _ = precision_recall_curve(y_test, y_score)
PrecisionRecallDisplay(precision=precision, recall=recall).plot()
# ---- Decision tree: exhaustive grid search ----
param_dist = {
    "criterion": ["gini", "entropy"],
    "splitter": ["best", "random"],
    "max_depth": [1, 2, 3],
}
grid_search = GridSearchCV(DecisionTreeClassifier(), param_dist)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)

# FIX: evaluate the estimator the grid search selected instead of
# hard-coded parameters that may disagree with best_params_.
clf = grid_search.best_estimator_
y_pred = clf.predict(X_test)
# FIX: score-based metrics use probabilities, not hard predictions.
y_score = clf.predict_proba(X_test)[:, 1]
print("ROC/AUC:", roc_auc_score(y_test, y_score))
print("Precision:", precision_score(y_test, y_pred))
precision, recall, _ = precision_recall_curve(y_test, y_score)
PrecisionRecallDisplay(precision=precision, recall=recall).plot()
# ---- KNN: exhaustive grid search ----
param_dist = {
    "n_neighbors": [1, 2, 3, 4],
    "weights": ["uniform", "distance"],
    "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
}
grid_search = GridSearchCV(KNeighborsClassifier(), param_dist)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)

# FIX: evaluate the estimator the grid search selected instead of
# hard-coded parameters that may disagree with best_params_.
clf = grid_search.best_estimator_
y_pred = clf.predict(X_test)
# FIX: score-based metrics use probabilities, not hard predictions.
y_score = clf.predict_proba(X_test)[:, 1]
print("ROC/AUC:", roc_auc_score(y_test, y_score))
print("Precision:", precision_score(y_test, y_pred))
precision, recall, _ = precision_recall_curve(y_test, y_score)
PrecisionRecallDisplay(precision=precision, recall=recall).plot()
# ---- Random forest: exhaustive grid search ----
param_dist = {
    "n_estimators": [100, 200, 300, 400],
    "criterion": ["gini", "entropy"],
}
grid_search = GridSearchCV(RandomForestClassifier(), param_dist)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)

# FIX: evaluate the estimator the grid search selected instead of
# hard-coded parameters that may disagree with best_params_.
clf = grid_search.best_estimator_
y_pred = clf.predict(X_test)
# FIX: score-based metrics use probabilities, not hard predictions.
y_score = clf.predict_proba(X_test)[:, 1]
print("ROC/AUC:", roc_auc_score(y_test, y_score))
print("Precision:", precision_score(y_test, y_pred))
precision, recall, _ = precision_recall_curve(y_test, y_score)
PrecisionRecallDisplay(precision=precision, recall=recall).plot()