import pandas as pd
import sqlite3
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Two data sources are opened here:
#   * a SQLite file, used by the SQL queries, which was produced with the
#     Excel-to-SQLite online converter at
#     https://www.rebasedata.com/convert-excel-to-sqlite-online
#   * the Excel workbook itself, loaded into a DataFrame for the pandas work.
SQLITE_PATH = 'pokemon.sqlite'
WORKBOOK_PATH = 'pokemon_data_science.xlsx'

con = sqlite3.connect(SQLITE_PATH)
cur = con.cursor()

pokemon = pd.read_excel(WORKBOOK_PATH)
# Preview the first rows of the DataFrame.
pokemon.head()
# 1) The number of distinct primary types present across Pokemon.
pd.read_sql_query("SELECT COUNT(DISTINCT Type_1) FROM pokemon", con)

# 2) The average Total stats for each Pokemon generation.
pd.read_sql_query(
    "SELECT Generation, AVG(Total) FROM pokemon GROUP BY Generation", con
)

# 3) The white Pokemon with the highest Total stats.
# The scalar subquery finds the best Total among white Pokemon; the outer
# query returns every white Pokemon that reaches it, so ties are all listed.
# A triple-quoted string keeps the SQL keywords properly separated (a
# backslash continuation inside a string literal joins the two lines with no
# whitespace between them).
pd.read_sql_query(
    """
    SELECT Name, Total, Color
    FROM pokemon
    WHERE Color = 'White'
      AND Total = (SELECT MAX(Total) FROM pokemon WHERE Color = 'White')
    """,
    con,
)
# Mean Total per primary type, highest average first.
# 'Total' is selected *before* aggregating so that text columns are never
# averaged: pandas >= 2.0 raises on non-numeric columns instead of silently
# dropping them.
typeTotals = (
    pokemon.groupby('Type_1')[['Total']]
    .mean()
    .sort_values(by='Total', ascending=False)
)
typeTotals

# Fig 1: every type is placed on a single horizontal line (constant y = 1),
# so only the x position (its average Total) carries information.
X = typeTotals['Total']
Y = len(typeTotals) * [1]

plt.figure(figsize=(10, 5))
plt.scatter(X, Y)
# X is indexed by type name, so values are paired with their labels by
# iteration rather than by integer lookup (X[i] would rely on pandas'
# deprecated positional fallback for label-indexed Series).
for label, x_val, y_val in zip(typeTotals.index, X, Y):
    # Nudge the text slightly above the marker so it does not cover the point.
    plt.annotate(
        label,
        xy=(x_val, y_val),
        xytext=(x_val, y_val + .005),
        ha='left',
        rotation=70,
    )
plt.title("1-D Plot of Average Total Stats by Pokemon Type (Fig 1)")
plt.xlabel('Average Total Stats')
plt.yticks([])  # the y axis has no meaning in a 1-D plot
# Total stats of every Pokemon whose primary type is Dragon.
is_dragon = pokemon['Type_1'] == 'Dragon'
dragonStats = pokemon.loc[is_dragon, 'Total']

# Fig 2: horizontal box plot of the Dragon totals.
plt.figure(figsize=(10, 5))
plt.boxplot(dragonStats, vert=False)
plt.title("Distribution of Total Stats in Dragon Type (Fig 2)")
plt.xlabel('Total Stats')
plt.yticks([])

# Quartiles, inter-quartile range and the 1.5 * IQR fences, printed next to
# the observed minimum and maximum so the fences can be compared with them.
q25, q75 = np.percentile(dragonStats, [25, 75])
iqr = q75 - q25
lower_fence = q25 - iqr * 1.5
upper_fence = q75 + iqr * 1.5

summary_rows = (
    ("1st Quartile:", q25),
    ("3rd Quartile:", q75),
    ("IQR:", iqr),
    ("Extreme Outlier Lower Boundary:", lower_fence),
    ("Minimum:", min(dragonStats)),
    ("Extreme Outlier Upper Boundary:", upper_fence),
    ("Maximum:", max(dragonStats)),
)
for caption, value in summary_rows:
    print(caption, value)
# Fig 3: kernel density estimates of Total for the Steel and Dragon primary
# types. Each column holds the totals of one type; the rows belonging to the
# other type are NaN in that column because the two selections share no rows.
typeDF = pd.DataFrame({
    type_name: pokemon.loc[pokemon['Type_1'] == type_name, 'Total']
    for type_name in ('Steel', 'Dragon')
})

ax = typeDF.plot.kde()
ax.set_xlabel('Total Stats')
ax.set_title("Kernel Density Estimation Plot of Total Stats by Pokemon Type (Fig 3)")
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
# Columns left out of the modelling table.
pokemon4 = pokemon.drop(columns=['Catch_Rate', 'Number', 'Name'])

# One-hot encode the remaining text columns.
pokemon_dum = pd.get_dummies(pokemon4)
pokemon_dum.head()

# Report how much of Pr_Male is missing, then fill the gaps with the column
# mean so the classifiers below receive no NaN values from this column.
missing_pr_male = int(pokemon_dum['Pr_Male'].isna().sum())
print("% of nan values in PR_Male:", 100 * missing_pr_male / len(pokemon_dum))
pokemon_dum = pokemon_dum.fillna({"Pr_Male": pokemon_dum["Pr_Male"].mean()})

# Correlation of each numeric/boolean column with the target.
# pokemon4 still contains text columns, and DataFrame.corr() raises on those
# in pandas >= 2.0, so the frame is restricted to numeric and bool dtypes
# first (this is what older pandas versions did implicitly).
pokemon4.select_dtypes(include=['number', 'bool']).corr()['isLegendary']
def _cross_validate(model, features, target, splitter):
    """Fit and score ``model`` on every fold produced by ``splitter``.

    For each fold the model is re-fitted on the training rows, the confusion
    matrix of the held-out rows is printed, and the fold accuracy is recorded.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            fitted in place, so after the call it holds the last fold's fit.
        features: DataFrame of predictors, indexed positionally via ``iloc``.
        target: Series of class labels aligned row-for-row with ``features``.
        splitter: Object whose ``split(features)`` yields
            ``(train_index, test_index)`` pairs of positional indices.

    Returns:
        A list with one accuracy value (fraction of correct predictions on
        the held-out rows) per fold.
    """
    # Pin the class order so every fold prints a matrix of the same shape,
    # even if a class happens to be absent from that fold's held-out rows.
    class_labels = np.unique(target)

    fold_accuracies = []
    for train_index, test_index in splitter.split(features):
        x_train, x_test = features.iloc[train_index], features.iloc[test_index]
        y_train, y_test = target.iloc[train_index], target.iloc[test_index]

        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)

        # For a binary target, in clockwise order from top-left:
        # TN, FP, TP, FN
        print(confusion_matrix(y_test, y_pred, labels=class_labels))
        fold_accuracies.append(np.count_nonzero(y_pred == y_test) / len(y_test))
    return fold_accuracies


# Predictors and target shared by both experiments below.
X = pokemon_dum.drop(['isLegendary'], axis=1)
y = pokemon_dum['isLegendary']

# NOTE(review): the folds are contiguous, unshuffled and not stratified, so
# each fold's class balance depends on the row order of the workbook.
# Consider StratifiedKFold(shuffle=True, random_state=...) if the reported
# numbers may change.
kf = KFold(n_splits=5, shuffle=False)

# Random Forest with Cross Validation
model = RandomForestClassifier(random_state=42)
accuracies = _cross_validate(model, X, y, kf)
print(np.mean(accuracies))

# Balanced Random Forest with the same folds, for a like-for-like comparison.
from imblearn.ensemble import BalancedRandomForestClassifier

model = BalancedRandomForestClassifier(random_state=42)
accuracies = _cross_validate(model, X, y, kf)
print("Average Test Accuracy across k=5 folds:", np.mean(accuracies))