# NianticChallenge-Summer2022

import pandas as pd import sqlite3 import numpy as np import matplotlib.pyplot as plt %matplotlib inline

# The SQLite file was generated from the Excel workbook with an online
# Excel-to-SQLite converter:
# https://www.rebasedata.com/convert-excel-to-sqlite-online
con = sqlite3.connect('pokemon.sqlite')
# Cursor bound to the connection above.
cur = con.cursor()

# Read the Excel workbook into a DataFrame and preview its first rows.
pokemon = pd.read_excel(io='pokemon_data_science.xlsx')
pokemon.head()

# 1) The number of distinct primary types present across Pokemon
pd.read_sql_query(
    sql="SELECT COUNT(DISTINCT Type_1) FROM pokemon",
    con=con,
)

# 2) The average Total stats for each Pokemon generation
pd.read_sql_query(
    sql="SELECT Generation, AVG(Total) FROM pokemon GROUP BY Generation",
    con=con,
)

# 3) The white Pokemon with the highest Total stats
# The inner SELECT restricts the rows to white Pokemon; the outer WHERE keeps
# only those whose Total equals the maximum Total among white Pokemon, so
# ties are all returned.
pd.read_sql_query(
    sql=(
        "SELECT Name, Total, Color FROM (SELECT * FROM pokemon WHERE Color='White') "
        "WHERE Total = (SELECT max(total) FROM pokemon WHERE color='White')"
    ),
    con=con,
)

# Average Total stats per primary type, sorted from highest to lowest.
# The Total column is selected *before* aggregating: calling .mean() on the
# whole grouped frame also tries to average the text columns (e.g. Name and
# Color), which raises TypeError on pandas >= 2.0 where numeric_only defaults
# to False. The result is still a one-column DataFrame indexed by Type_1.
typeTotals = (
    pokemon.groupby(['Type_1'])[['Total']]
    .mean()
    .sort_values(by=['Total'], ascending=False)
)
typeTotals

# Fig 1: one-dimensional scatter of the average Total stats of each type.
X = typeTotals['Total']
Y = len(typeTotals) * [1]  # constant height so every point sits on one line

plt.figure(figsize=(10, 5))
plt.scatter(X, Y)
# X is indexed by type name, so X[i] with an integer i depends on the
# positional fallback of Series.__getitem__ that pandas has deprecated.
# Walking the labels and values in lockstep avoids label/position ambiguity.
for label, x_pos, y_pos in zip(typeTotals.index, X, Y):
    # Place the text slightly above the point, tilted so labels of
    # neighbouring points overlap less.
    plt.annotate(label, (x_pos, y_pos), (x_pos, y_pos + .005),
                 ha='left', rotation=70)
plt.title("1-D Plot of Average Total Stats by Pokemon Type (Fig 1)")
plt.xlabel('Average Total Stats')
plt.yticks([])  # the y axis carries no information in this plot

# Fig 2: horizontal box plot of Total stats for Pokemon whose primary type
# is Dragon, followed by the quartile figures printed as text.
dragonStats = pokemon.loc[pokemon['Type_1'] == 'Dragon', 'Total']

plt.figure(figsize=(10, 5))
plt.boxplot(dragonStats, vert=False)
plt.title("Distribution of Total Stats in Dragon Type (Fig 2)")
plt.xlabel('Total Stats')
plt.yticks([])

# Quartiles and interquartile range; the printed boundaries are the
# quartiles pushed outwards by 1.5 * IQR.
q25, q75 = np.percentile(dragonStats, [25, 75])
iqr = q75 - q25
print("1st Quartile:", q25)
print("3rd Quartile:", q75)
print("IQR:", iqr)
print("Extreme Outlier Lower Boundary:", q25 - 1.5 * iqr)
print("Minimum:", dragonStats.min())
print("Extreme Outlier Upper Boundary:", q75 + 1.5 * iqr)
print("Maximum:", dragonStats.max())

# Fig 3: kernel density estimates of Total stats for the Steel and Dragon
# primary types, one column (and therefore one curve) per type.
typeDF = pd.DataFrame({
    type_name: pokemon.loc[pokemon['Type_1'] == type_name, 'Total']
    for type_name in ('Steel', 'Dragon')
})
ax = typeDF.plot(kind='kde')
plt.xlabel('Total Stats')
plt.title("Kernel Density Estimation Plot of Total Stats by Pokemon Type (Fig 3)")

from sklearn.model_selection import train_test_split, KFold from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import confusion_matrix

# Copy of the dataset without the Catch_Rate, Number and Name columns;
# this is the frame that gets one-hot encoded for the classifiers.
pokemon4 = pokemon.drop(columns=['Catch_Rate', 'Number', 'Name'])

# One Hot Encoding
pokemon_dum = pd.get_dummies(data=pokemon4)
pokemon_dum.head()

# Report the percentage of rows with a missing Pr_Male value, then replace
# those missing values with the column mean.
print(
    "% of nan values in PR_Male:",
    100 * np.count_nonzero(pokemon_dum['Pr_Male'].isna()) / len(pokemon_dum),
)
pokemon_dum = pokemon_dum.fillna(value={"Pr_Male": pokemon_dum["Pr_Male"].mean()})

# Correlation of each numeric/boolean column with isLegendary.
# pokemon4 still contains text columns (e.g. Type_1), and DataFrame.corr()
# raises on non-numeric data in pandas >= 2.0, where it no longer drops such
# columns silently. Restricting the frame to number and bool dtypes first
# works on old and new pandas alike.
# NOTE(review): assumes isLegendary is stored as a bool or numeric column
# (it is used as the classifier target below) - confirm against the data.
pokemon4.select_dtypes(include=['number', 'bool']).corr()['isLegendary']

# Random Forest with Cross Validation
# 5-fold split without shuffling; for each fold the classifier is refit on
# the training rows, its confusion matrix on the held-out rows is printed
# and its accuracy recorded. The mean accuracy over the folds is printed last.
model = RandomForestClassifier(random_state=42)
y = pokemon_dum['isLegendary']
X = pokemon_dum.drop(columns=['isLegendary'])
kf = KFold(n_splits=5, shuffle=False)

accuracies = []
for train_index, test_index in kf.split(X):
    x_train, y_train = X.iloc[train_index], y.iloc[train_index]
    x_test, y_test = X.iloc[test_index], y.iloc[test_index]
    y_pred = model.fit(x_train, y_train).predict(x_test)
    # in clockwise order from top-left: TN, FP, TP, FN
    print(confusion_matrix(y_test, y_pred))
    # fraction of held-out rows predicted correctly
    accuracies.append(np.count_nonzero(y_pred == y_test) / len(y_test))
print(np.mean(accuracies))

from imblearn.ensemble import BalancedRandomForestClassifier

# Same 5-fold evaluation as the plain random forest, but using imblearn's
# BalancedRandomForestClassifier as the model.
model = BalancedRandomForestClassifier(random_state=42)
y = pokemon_dum['isLegendary']
X = pokemon_dum.drop(columns=['isLegendary'])
kf = KFold(n_splits=5, shuffle=False)

accuracies = []
for train_index, test_index in kf.split(X):
    x_train, y_train = X.iloc[train_index], y.iloc[train_index]
    x_test, y_test = X.iloc[test_index], y.iloc[test_index]
    y_pred = model.fit(x_train, y_train).predict(x_test)
    # in clockwise order from top-left: TN, FP, TP, FN
    print(confusion_matrix(y_test, y_pred))
    # fraction of held-out rows predicted correctly
    accuracies.append(np.count_nonzero(y_pred == y_test) / len(y_test))
print("Average Test Accuracy across k=5 folds:", np.mean(accuracies))