import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import os
import warnings
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import LabelEncoder
from scipy.cluster import hierarchy
from scipy.spatial import distance_matrix
warnings.filterwarnings('ignore')
# Peek at the data directory (the value is only displayed when run as a
# notebook cell).
os.listdir("data/")

# Seed NumPy's global RNG once, up front, so anything drawing from it later
# in the script is repeatable.
np.random.seed(2021)

df = pd.read_csv('./data/russian_alcohol_consumption.csv')
df.head()
df.describe()
list(df['region'].unique())

# Correlate the numeric columns only: 'region' holds strings, and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0 (older
# versions dropped such columns silently, so the result is unchanged there).
dataplot = sns.heatmap(
    df.select_dtypes(include="number").corr(), cmap="YlGnBu", annot=True
)
plt.show()
# Per-column missing-value summary: absolute count and share of all rows.
# The counts are computed once and reused for both rows of the table.
na_counts = df.isna().sum()
na_percent = na_counts / df.shape[0] * 100
NA = pd.DataFrame(
    data=[
        na_counts.tolist(),
        ["{:.2f}".format(pct) + "%" for pct in na_percent.tolist()],
    ],
    columns=df.columns,
    index=["NA Count", "NA Percent"],
).T.sort_values(by="NA Count", ascending=False)
NA.style.background_gradient(cmap="seismic", subset=["NA Count"])

# Forward-fill the gaps. DataFrame.ffill() is the supported spelling of
# fillna(method="pad"), which is deprecated in recent pandas releases.
# NOTE(review): the fill follows row order, so a gap is filled from the
# preceding row whichever region/year that row belongs to -- confirm that
# this is the intended imputation.
df.ffill(inplace=True)
df.isna().sum()
# Join the per-region geo table onto the consumption data, average every
# column per region, and rank the regions by mean beer consumption.
geo = pd.read_csv("data/russian_geo.csv")
merged = pd.merge(df, geo, on="region")
region_means = merged.groupby("region").mean().reset_index()
df_geo = region_means.sort_values("beer", ascending=False)

# Total consumption per region, accumulated drink by drink.
total = df_geo["wine"]
for drink in ("brandy", "vodka", "beer", "champagne"):
    total = total + df_geo[drink]
df_geo["total"] = total

# Label text used by the map markers: region name plus its total.
total_as_text = df_geo["total"].astype(str)
df_geo["text"] = df_geo["region"] + "<br> Alcohol consumption:" + total_as_text
# Rank buckets as half-open positional slices [start, stop) over df_geo.
# Consecutive buckets share their boundary so no row falls between them
# (the previous bounds (0, 10), (11, 21), (22, 84) skipped the rows at
# positions 10 and 21), and the last bucket runs to the end of the frame so
# every region is drawn.
limits = [(0, 10), (10, 21), (21, len(df_geo))]
colors = ["royalblue", "crimson", "lightseagreen"]
scale = 500  # not applied below: marker area uses the raw "total" value
fig = go.Figure()
for (start, stop), color in zip(limits, colors):
    df_sub = df_geo[start:stop]
    fig.add_trace(
        go.Scattergeo(
            lon=df_sub["lon"],
            lat=df_sub["lat"],
            text=df_sub["text"],
            marker=dict(
                size=df_sub["total"],
                color=color,
                line_color="rgb(40,40,40)",
                line_width=0.5,
                sizemode="area",
            ),
            # Human-readable 1-based rank range for the legend entry.
            name="Top {0} - {1}".format(start + 1, stop),
        )
    )
fig.update_layout(
    title_text="Alcohol Consumption in Russia",
    showlegend=True,
    legend_title="Beer 1 liter/percapita",
    legend_title_font_size=14,
    geo=dict(
        scope="world",
        landcolor="rgb(217, 217, 217)",
        # Restrict the visible window of the world map to this lon/lat box.
        lonaxis=dict(range=[23.6985, 80.06269]),
        lataxis=dict(range=[33.72197, 80.7293]),
    ),
)
fig.show()
# One line plot per drink, side by side on a shared y axis. The plotted
# columns are df.columns[2] through df.columns[6], one per panel.
fig, axes = plt.subplots(1, 5, figsize=(15, 6), sharey=True)
for panel_index, panel in enumerate(axes):
    column_name = df.columns[panel_index + 2]
    sns.lineplot(data=df, x="year", y=f"{column_name}", ax=panel)
    panel.set_title(f"{column_name} consumption")
    panel.set_xlabel("Year")
    panel.set_ylabel("litres per capita")
# Long-format table of yearly mean consumption: one row per (year, type).
# numeric_only=True keeps the string 'region' column out of the mean;
# pandas >= 2.0 raises on it instead of silently dropping it.
year_df = (
    df.groupby("year")
    .mean(numeric_only=True)
    .stack(0)
    .reset_index()
    .rename(columns={"level_1": "type", 0: "consumption"})
)

# Animated bubble chart: one frame per year, one bubble per drink type.
fig = px.scatter(
    year_df,
    x="type",
    y="consumption",
    animation_frame="year",
    color="type",
    size="consumption",
    size_max=55,
    range_y=[0, 60],
)
# Optional: fig["layout"].pop("updatemenus") drops the animation buttons.
fig.show()
# Feature matrix: mean consumption of each drink per region (index = region).
# The columns are selected with a list; indexing a groupby with a bare tuple
# of names is no longer accepted by pandas >= 2.0.
drink_cols = ["wine", "beer", "vodka", "champagne", "brandy"]
X = df.groupby("region")[drink_cols].mean()

# Elbow search: fit KMeans for k = 1..9 and record each model's inertia.
cluster_counts = list(range(1, 10))
clusters = [KMeans(n_clusters=k).fit(X).inertia_ for k in cluster_counts]

fig, ax = plt.subplots(figsize=(12, 8))
sns.lineplot(x=cluster_counts, y=clusters, ax=ax)
ax.set_title("Searching for Elbow")
ax.set_xlabel("Clusters")
ax.set_ylabel("Inertia")
# Cluster the regions with k = 2 and then k = 3, plotting wine vs beer each
# time. Every fit uses the drink columns only: once "Labels" has been added
# to X, fitting on the whole frame would feed the previous run's cluster ids
# back in as a feature. After the loop, km and X["Labels"] hold the
# 3-cluster result.
drink_cols = ["wine", "beer", "vodka", "champagne", "brandy"]
for n_clusters in (2, 3):
    km = KMeans(n_clusters=n_clusters).fit(X[drink_cols])
    X["Labels"] = km.predict(X[drink_cols])
    plt.figure(figsize=(12, 8))
    # x/y are passed by keyword; seaborn >= 0.12 rejects them positionally.
    sns.scatterplot(
        x=X["wine"], y=X["beer"], hue=X["Labels"], palette="Set1", alpha=0.7
    )
    plt.title(f"KMeans with {n_clusters} Clusters")
    plt.show()
# Four drink-vs-drink scatter plots coloured by cluster label.
# Each entry is (x column, y column, panel title).
pair_plots = [
    ("wine", "beer", "beer V/S wine"),
    ("vodka", "champagne", "vodka V/S champagne"),
    ("wine", "champagne", "wine V/S champagne"),
    ("vodka", "brandy", "vodka V/S brandy"),
]
fig, axes = plt.subplots(1, 4, figsize=(15, 6))
for panel, (x_col, y_col, title) in zip(axes, pair_plots):
    # x/y are passed by keyword; seaborn >= 0.12 rejects them positionally.
    sns.scatterplot(
        x=X[x_col],
        y=X[y_col],
        hue=X["Labels"],
        palette="Set1",
        alpha=0.7,
        ax=panel,
    )
    panel.set_title(title)
plt.show()
# One swarm plot per drink, split by cluster label, laid out in a 1x5 row.
fig = plt.figure(figsize=(10, 8))
swarm_axes = []
drinks_in_order = ("beer", "vodka", "wine", "champagne", "brandy")
for position, drink in enumerate(drinks_in_order, start=1):
    panel = fig.add_subplot(1, 5, position)
    sns.swarmplot(x="Labels", y=drink, data=X, ax=panel)
    panel.set_title(drink)
    swarm_axes.append(panel)
# Keep the individual axis handles available under their usual names.
ax1, ax2, ax3, ax4, ax5 = swarm_axes
plt.show()
# Combined consumption per region, accumulated drink by drink, then shown
# as a swarm plot split by cluster label.
com_X = X["beer"]
for drink in ("wine", "champagne", "vodka", "brandy"):
    com_X = com_X + X[drink]
sns.swarmplot(x="Labels", y=com_X, data=X)
plt.title("Labels V/S All drinks")
# Hierarchical clustering on the drink columns only, so that the "Labels"
# column holding the KMeans ids is not treated as a feature.
drink_cols = ["wine", "beer", "vodka", "champagne", "brandy"]
features = X[drink_cols]

# Full square Euclidean distance matrix between regions; kept for inspection.
dist = distance_matrix(features, features)

# linkage() expects either raw observations or a *condensed* distance
# matrix. Handing it the square matrix made it treat each row of distances
# as an observation vector, so the observations are passed directly and
# linkage computes the Euclidean distances itself.
Z = hierarchy.linkage(features, method="average", metric="euclidean")
plt.figure(figsize=(12, 8))
dendro = hierarchy.dendrogram(Z, leaf_rotation=0, leaf_font_size=6, orientation="right")

# Ward linkage always uses Euclidean distance, so no metric argument is
# given; the old affinity= keyword is not accepted by recent scikit-learn.
hc = AgglomerativeClustering(n_clusters=4, linkage="ward")
y_hc = hc.fit_predict(features)
plt.figure(figsize=(12, 8))
# x/y are passed by keyword; seaborn >= 0.12 rejects them positionally.
sns.scatterplot(x=X["wine"], y=X["beer"], hue=y_hc, palette="Set1", alpha=0.7)
plt.title("Agglomerative Clustering with 4 Clusters")
plt.show()
# Attach both clusterings and the combined consumption to the region table.
X["H_Pop"] = y_hc
X.rename(columns={"Labels": "KM_Pop"}, inplace=True)
X["Total_Drinks"] = com_X

# Cluster ids are arbitrary integers that can change from run to run, so the
# tier names are derived from the data rather than from fixed ids: clusters
# are ranked by their mean total consumption and named in ascending order.
# NOTE(review): this assumes the tiers are meant to order by total
# consumption -- confirm against the intended reading of the clusters.
h_rank = X.groupby("H_Pop")["Total_Drinks"].mean().sort_values().index
X["H_Pop"] = X["H_Pop"].replace(dict(zip(h_rank, ["Low", "Medium", "High", "Top"])))
X[X["H_Pop"] == "Top"].head(2)

km_rank = X.groupby("KM_Pop")["Total_Drinks"].mean().sort_values().index
X["KM_Pop"] = X["KM_Pop"].replace(dict(zip(km_rank, ["Low", "Medium", "High"])))
X[X["KM_Pop"] == "High"].sort_values("Total_Drinks", ascending=False).head(11)
# Regions placed in the highest tier by both clusterings.
top_tier = X[(X["KM_Pop"] == "High") & (X["H_Pop"] == "Top")]

# Top 11 by total consumption, listed without Saint Petersburg.
# errors="ignore" makes the drop a no-op when Saint Petersburg is not among
# the selected regions, instead of raising KeyError.
city_names_total = (
    top_tier.sort_values("Total_Drinks", ascending=False).head(11).index
)
list(city_names_total.drop("Saint Petersburg", errors="ignore"))

# Same selection ranked by wine consumption.
city_names_wine = top_tier.sort_values("wine", ascending=False).head(11).index
list(city_names_wine.drop("Saint Petersburg", errors="ignore"))
# IPython shell escape: runs `nbqa black` on notebook.ipynb. The leading "!"
# is only understood by IPython/Jupyter, not by the plain Python interpreter.
!nbqa black notebook.ipynb