import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, KernelPCA
from sklearn.cluster import MeanShift, KMeans
# Load the team earnings table (the path points at a Colab sample-data folder).
DATA_PATH = "/content/sample_data/highest_earning_teams.csv"
df = pd.read_csv(DATA_PATH)

# Quick inspection of the raw table: contents, column dtypes, summary
# statistics and the per-column info report.
df
df.dtypes
df.describe()
df.info()

# Use a large canvas for the seaborn figures drawn from here on.
sns.set(rc={"figure.figsize": (15, 9)})

# Rank teams by total prize money and chart the ten highest earners.
df_t = df.sort_values(by="TotalUSDPrize", ascending=False)
top_ten_teams = df_t.head(10)
sns.barplot(data=top_ten_teams, x="TeamName", y="TotalUSDPrize")
# Abbreviations for the long game titles; titles not listed are kept as-is.
_GAME_CODES = {
    "Counter-Strike: Global Offensive": "CSGO",
    "League of Legends": "LOL",
    "Heroes of the Storm": "HOS",
}


def game_code(x):
    """Return the short code for a game title.

    Args:
        x: A game title (must be hashable, e.g. a string or NaN).

    Returns:
        "CSGO", "LOL" or "HOS" for the three titles that have an
        abbreviation; otherwise ``x`` itself, unchanged.
    """
    return _GAME_CODES.get(x, x)
# Replace the long game titles with their short codes (no lambda wrapper
# needed: game_code already takes a single value).
df["Game"] = df["Game"].apply(game_code)

# Mean of every numeric column per game.  The numeric columns are named
# explicitly because pandas >= 2.0 raises a TypeError instead of silently
# dropping text columns such as TeamName when averaging.
numeric_cols = df.select_dtypes(include="number").columns.tolist()
df_g = pd.pivot_table(data=df, index=["Game"], values=numeric_cols).reset_index()

# df_t was sorted out of df before the recode above, so it still carries the
# full titles; apply the same codes on a temporary copy for the box plot
# (df_t itself is left untouched).
sns.boxplot(
    data=df_t.assign(Game=df_t["Game"].map(game_code)),
    x="Game",
    y="TotalUSDPrize",
)

# Average prize money per game, highest first.
sns.barplot(
    data=df_g.sort_values(by="TotalUSDPrize", ascending=False),
    x="Game",
    y="TotalUSDPrize",
)
df_g

# Relationship between tournaments played and prize money won.
sns.scatterplot(data=df_t, x="TotalTournaments", y="TotalUSDPrize")

# Correlate numeric columns only: DataFrame.corr() no longer skips text
# columns by default in pandas >= 2.0 and would raise on them.
sns.heatmap(df_t.select_dtypes(include="number").corr(), annot=True)
# Average prize money earned per tournament, rounded to two decimals.
# The division is done row-wise on df_t's own columns.  The previous
# element-wise apply divided each scalar prize by the *entire*
# df["TotalTournaments"] column, producing one Series per row (an n x n
# frame) instead of a single ratio per team.
# NOTE(review): a team with TotalTournaments == 0 would yield inf/NaN here;
# confirm the dataset has none.
df_t["PricePerTournaments"] = (
    df_t["TotalUSDPrize"] / df_t["TotalTournaments"]
).round(2)

# Re-rank teams by their per-tournament earnings.
df_t = df_t.sort_values(by="PricePerTournaments", ascending=False)
sns.pairplot(df_t)
df_t
df
# Feature matrix: everything except the team identifiers, with the
# object/categorical columns expanded into indicator columns.
X = pd.get_dummies(df.drop(columns=["TeamName", "TeamId"]))
X

# Standardise the features before the decomposition.
X = StandardScaler().fit_transform(X)

# Project the standardised features onto two principal components.
pca = PCA(n_components=2).fit(X)
X_p = pca.transform(X)
X_p

# Cluster the 2-D projection with MeanShift.
meanshift = MeanShift(max_iter=1000).fit(X_p)

X_p = pd.DataFrame(X_p)
X_p

# Predict once and reuse the labels for both plots and the "group" column.
pca_labels = meanshift.predict(X_p)
sns.scatterplot(x=X_p[0], y=X_p[1], hue=pca_labels)
df["group"] = pca_labels
sns.scatterplot(x=df["TotalTournaments"], y=df["TotalUSDPrize"], hue=df["group"])
df

# Highest cluster label assigned.
max(pca_labels)
# Same pipeline as the linear PCA step, but with a polynomial-kernel PCA.
kpca = KernelPCA(n_components=2, kernel="poly").fit(X)
X_kp = kpca.transform(X)

# Cluster the kernel-PCA projection and predict the labels a single time.
meanshift = MeanShift(max_iter=1000).fit(X_kp)
kpca_labels = meanshift.predict(X_kp)
max(kpca_labels)

# Clusters in the projected space.
X_kp = pd.DataFrame(X_kp)
sns.scatterplot(x=X_kp[0], y=X_kp[1], hue=kpca_labels)

# The same clusters shown against the original tournament/prize axes.
sns.scatterplot(x=df["TotalTournaments"], y=df["TotalUSDPrize"], hue=kpca_labels)

# Overwrite the earlier grouping with the kernel-PCA based one.
df["group"] = kpca_labels
df
max(kpca_labels)
# MeanShift directly on the full standardised feature matrix (no projection).
X
meanshift = MeanShift(max_iter=1000).fit(X)

# Cluster label for every row of X, reused for the plot and the maximum.
X_w = meanshift.predict(X)
sns.scatterplot(x=df["TotalTournaments"], y=df["TotalUSDPrize"], hue=X_w)
max(X_w)
# Elbow curve: within-cluster sum of squares (KMeans inertia) for k = 1..10.
cluster_counts = range(1, 11)
wcss = []
for i in cluster_counts:
    # A fixed seed keeps the curve reproducible between runs; KMeans
    # initialisation is otherwise random.
    kmeans = KMeans(n_clusters=i, max_iter=1000, random_state=0).fit(X)
    wcss.append(kmeans.inertia_)

sns.lineplot(x=list(cluster_counts), y=wcss)
wcss
# Refit the two-component PCA and plot the variance explained by each one.
pca = PCA(n_components=2).fit(X)
sns.lineplot(x=range(pca.n_components_), y=pca.explained_variance_ratio_)
X_p = pca.transform(X)
X_p

# KMeans with five clusters on the projection.  A fixed seed keeps the
# cluster assignment reproducible between runs.
kmeans = KMeans(n_clusters=5, random_state=0).fit(X_p)
kmeans_labels = kmeans.predict(X_p)
df["group"] = kmeans_labels

# Plot against df, not df_t: the labels are in df's row order, whereas df_t
# has been re-sorted, so pairing them positionally with df_t's columns
# coloured the points with the wrong clusters.
sns.scatterplot(x=df["TotalTournaments"], y=df["TotalUSDPrize"], hue=df["group"])

# Clusters in the projected space.
X_p = pd.DataFrame(X_p)
sns.scatterplot(x=X_p[0], y=X_p[1], hue=kmeans_labels)