import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import os
import warnings
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import LabelEncoder
from scipy.cluster import hierarchy
from scipy.spatial import distance_matrix
# --- Setup and first look at the data ---------------------------------------
# Silence library warnings so the notebook output stays readable.
# NOTE(review): this also hides deprecation warnings from the plotting and
# modelling libraries used below.
warnings.filterwarnings("ignore")
# List the data directory (only displayed when run as a notebook cell).
os.listdir("data/")
# Seed NumPy's global RNG so the randomised steps below are repeatable.
np.random.seed(2021)

df = pd.read_csv("./data/russian_alcohol_consumption.csv")
# Bare expressions: shown as cell output in a notebook, no effect in a script.
df.head()
df.describe()
list(df["region"].unique())

# Correlate the numeric columns only.  `region` is a text column, and
# pandas >= 2.0 raises on non-numeric columns instead of silently dropping
# them; selecting by dtype behaves the same on old and new pandas.
dataplot = sns.heatmap(
    df.select_dtypes(include="number").corr(), cmap="YlGnBu", annot=True
)
plt.show()
# --- Missing-value report and imputation ------------------------------------
# Count missing values once and derive the percentage from the same Series.
na_count = df.isna().sum()
na_percent = na_count / df.shape[0] * 100

# One row per column of `df`, most incomplete columns first.  Building the
# table column-wise keeps "NA Count" numeric, which the gradient below needs.
NA = pd.DataFrame(
    {
        "NA Count": na_count,
        "NA Percent": [f"{pct:.2f}%" for pct in na_percent],
    }
).sort_values(by="NA Count", ascending=False)
NA.style.background_gradient(cmap="seismic", subset=["NA Count"])

# Forward-fill gaps from the previous row.  `ffill` is the supported spelling
# of the deprecated `fillna(method="pad")`.
# NOTE(review): the fill follows row order, so a gap may be filled from a
# different region's row -- confirm this is intended versus a per-region fill.
df.ffill(inplace=True)
df.isna().sum()
# --- Per-region averages joined with coordinates ----------------------------
geo = pd.read_csv("data/russian_geo.csv")

# Inner join on region name (regions missing from either table are dropped),
# then average every numeric column per region and rank regions by beer.
# `numeric_only=True` keeps the aggregation working if the geo file carries
# extra text columns; newer pandas raises on them instead of skipping them.
df_geo = (
    pd.merge(df, geo, on="region")
    .groupby("region")
    .mean(numeric_only=True)
    .reset_index()
    .sort_values("beer", ascending=False)
)

# Combined consumption across the five beverage columns.
df_geo["total"] = (
    df_geo["wine"]
    + df_geo["brandy"]
    + df_geo["vodka"]
    + df_geo["beer"]
    + df_geo["champagne"]
)

# Hover text for the map markers ("<br>" is a line break in plotly labels).
df_geo["text"] = (
    df_geo["region"] + "<br> Alcohol consumption:" + df_geo["total"].astype(str)
)
# --- Bubble map of total consumption, grouped by beer ranking ---------------
# Half-open positional bands [start, stop) over df_geo, which is sorted by
# beer consumption (descending).  The bands must be contiguous: the previous
# (0, 10), (11, 21), (22, 84) silently dropped the rows at positions 10 and 21.
limits = [(0, 10), (10, 21), (21, 84)]
colors = ["royalblue", "crimson", "lightseagreen"]
scale = 500  # NOTE(review): not used in this section; kept in case other code reads it

fig = go.Figure()
for (start, stop), color in zip(limits, colors):
    # Positional slice: df_geo's index labels are shuffled by the sort.
    df_sub = df_geo.iloc[start:stop]
    fig.add_trace(
        go.Scattergeo(
            lon=df_sub["lon"],
            lat=df_sub["lat"],
            text=df_sub["text"],
            marker=dict(
                size=df_sub["total"],
                color=color,
                line_color="rgb(40,40,40)",
                line_width=0.5,
                sizemode="area",
            ),
            # 1-based, inclusive rank range for the legend entry.
            name="Top {0} - {1}".format(start + 1, stop),
        )
    )

fig.update_layout(
    title_text="Alcohol Consumption in Russia",
    showlegend=True,
    legend_title="Beer 1 liter/percapita",
    legend_title_font_size=14,
    geo=dict(
        scope="world",
        landcolor="rgb(217, 217, 217)",
        # Restrict the visible window to the given longitude/latitude ranges.
        lonaxis=dict(range=[23.6985, 80.06269]),
        lataxis=dict(range=[33.72197, 80.7293]),
    ),
)
fig.show()
# --- One line plot per beverage column, sharing the y axis ------------------
fig, axes = plt.subplots(1, 5, figsize=(15, 6), sharey=True)

# The five plotted columns are the ones at positions 2..6 of `df`
# (presumably the beverage columns -- this relies on the CSV column order).
for position, panel in enumerate(axes, start=2):
    column_name = df.columns[position]
    sns.lineplot(data=df, x="year", y=f"{column_name}", ax=panel)
    panel.set_title(f"{column_name} consumption")
    panel.set_xlabel("Year")
    panel.set_ylabel("litres per capita")
# --- Animated bubble chart: mean consumption per beverage, by year ----------
# Average each numeric column per year, then reshape to long form with one
# row per (year, type).  `numeric_only=True` is required because `df` also
# holds the text column `region`, which newer pandas refuses to average.
year_df = (
    df.groupby("year")
    .mean(numeric_only=True)
    .stack(0)
    .reset_index()
    # After reset_index the stacked level is "level_1" and the values are 0.
    .rename(columns={"level_1": "type", 0: "consumption"})
)

fig = px.scatter(
    year_df,
    x="type",
    y="consumption",
    animation_frame="year",
    color="type",
    size="consumption",
    size_max=55,
    # Fixed y range so bubbles are comparable between animation frames.
    range_y=[0, 60],
)
# fig["layout"].pop("updatemenus") # optional, drop animation buttons
fig.show()
# --- Feature matrix and elbow plot for KMeans -------------------------------
# Mean consumption per region for the five beverages.  The columns must be
# selected with a list: selecting with a bare tuple of names on a groupby
# was removed in pandas 2.0.
X = df.groupby("region")[["wine", "beer", "vodka", "champagne", "brandy"]].mean()

# Within-cluster sum of squares (inertia) for k = 1..9.
clusters = [KMeans(n_clusters=k).fit(X).inertia_ for k in range(1, 10)]

fig, ax = plt.subplots(figsize=(12, 8))
sns.lineplot(x=list(range(1, 10)), y=clusters, ax=ax)
ax.set_title("Searching for Elbow")
ax.set_xlabel("Clusters")
ax.set_ylabel("Inertia")
# --- KMeans with 2 and then 3 clusters --------------------------------------
# Always fit on the beverage columns only.  `X` gains a "Labels" column after
# the first fit; fitting on all of `X` again would feed the previous cluster
# assignment into the next model as if it were a feature.
feature_cols = ["wine", "beer", "vodka", "champagne", "brandy"]

for n_clusters in (2, 3):
    km = KMeans(n_clusters=n_clusters).fit(X[feature_cols])
    # After the loop "Labels" holds the 3-cluster assignment.
    X["Labels"] = km.predict(X[feature_cols])

    plt.figure(figsize=(12, 8))
    # x/y are passed by keyword: seaborn >= 0.12 rejects positional data args.
    sns.scatterplot(
        data=X, x="wine", y="beer", hue="Labels", palette="Set1", alpha=0.7
    )
    plt.title(f"KMeans with {n_clusters} Clusters")
    plt.show()
# --- Cluster labels on four pairs of beverages ------------------------------
# (x column, y column, panel title) for each subplot, left to right.
pair_panels = [
    ("wine", "beer", "beer V/S wine"),
    ("vodka", "champagne", "vodka V/S champagne"),
    ("wine", "champagne", "wine V/S champagne"),
    ("vodka", "brandy", "vodka V/S brandy"),
]

fig, axes = plt.subplots(1, 4, figsize=(15, 6))
for panel, (x_col, y_col, title) in zip(axes, pair_panels):
    # Keyword x/y: seaborn >= 0.12 no longer accepts them positionally.
    sns.scatterplot(
        data=X,
        x=x_col,
        y=y_col,
        hue="Labels",
        palette="Set1",
        alpha=0.7,
        ax=panel,
    )
    panel.set_title(title)
plt.show()
# --- Swarm plot of each beverage, split by cluster label --------------------
fig = plt.figure(figsize=(10, 8))

# Five side-by-side panels in a 1x5 grid (slots are 1-based).
ax1, ax2, ax3, ax4, ax5 = [fig.add_subplot(1, 5, slot) for slot in range(1, 6)]

drink_panels = zip(
    ("beer", "vodka", "wine", "champagne", "brandy"),
    (ax1, ax2, ax3, ax4, ax5),
)
for drink, panel in drink_panels:
    sns.swarmplot(x="Labels", y=drink, data=X, ax=panel)
    panel.set_title(drink)

plt.show()
# --- Combined consumption per region, split by cluster label ----------------
# Add the five beverage columns left to right, starting from beer.
other_drinks = ("wine", "champagne", "vodka", "brandy")
com_X = sum((X[drink] for drink in other_drinks), X["beer"])

sns.swarmplot(x="Labels", y=com_X, data=X)
plt.title("Labels V/S All drinks")
# --- Hierarchical clustering dendrogram -------------------------------------
# Distances are computed on the beverage columns only; `X` may also carry
# cluster-label columns, which are not features.
feature_cols = ["wine", "beer", "vodka", "champagne", "brandy"]
dist = distance_matrix(X[feature_cols], X[feature_cols])

# `linkage` expects a *condensed* distance vector.  Given the square matrix it
# would treat every row as an observation vector and cluster the wrong thing.
# The upper triangle (row-major, i < j) is exactly the condensed layout.
condensed_dist = dist[np.triu_indices_from(dist, k=1)]
Z = hierarchy.linkage(condensed_dist, "average")

plt.figure(figsize=(12, 8))
dendro = hierarchy.dendrogram(Z, leaf_rotation=0, leaf_font_size=6, orientation="right")
# --- Agglomerative (Ward) clustering with 4 clusters ------------------------
# The `affinity` argument is omitted: it has been removed from scikit-learn,
# and Euclidean distance (the only one Ward linkage supports) is the default.
hc = AgglomerativeClustering(n_clusters=4, linkage="ward")

# Cluster on the beverage columns only; `X` also holds the KMeans "Labels"
# column, which must not be used as a feature.
feature_cols = ["wine", "beer", "vodka", "champagne", "brandy"]
y_hc = hc.fit_predict(X[feature_cols])

plt.figure(figsize=(12, 8))
# Keyword x/y: seaborn >= 0.12 no longer accepts them positionally.
sns.scatterplot(x=X["wine"], y=X["beer"], hue=y_hc, palette="Set1", alpha=0.7)
plt.title("Agglomerative Clustering with 4 Clusters")
plt.show()
# --- Turn numeric cluster ids into readable popularity levels ---------------
def _name_clusters_by_total(cluster_ids, totals, names):
    """Return `cluster_ids` mapped to `names`, ranked by mean of `totals`.

    Cluster ids are arbitrary and can change between runs or library
    versions, so a hard-coded id -> name table is not reliable.  Instead the
    clusters are ordered by their mean total consumption and named in that
    order.

    cluster_ids: Series of cluster ids, indexed like `totals`.
    totals: Series of total consumption per row.
    names: level names from lowest to highest consumption.  Clusters beyond
        len(names) are left unnamed (NaN).
    """
    ranked_ids = totals.groupby(cluster_ids).mean().sort_values().index
    return cluster_ids.map(dict(zip(ranked_ids, names)))


# NOTE(review): assumes the level names are meant to follow total consumption
# (com_X) -- confirm against the cluster plots.
X["H_Pop"] = _name_clusters_by_total(
    pd.Series(y_hc, index=X.index), com_X, ["Low", "Medium", "High", "Top"]
)
X[X["H_Pop"] == "Top"].head(2)

X.rename(columns={"Labels": "KM_Pop"}, inplace=True)
X["Total_Drinks"] = com_X
X["KM_Pop"] = _name_clusters_by_total(X["KM_Pop"], com_X, ["Low", "Medium", "High"])
X[X["KM_Pop"] == "High"].sort_values("Total_Drinks", ascending=False).head(11)
# --- Regions that both clusterings place in their highest group -------------
# Build the filter once and reuse it for both rankings.
top_regions = X[(X["KM_Pop"] == "High") & (X["H_Pop"] == "Top")]

# Up to 11 region names ranked by total consumption.
city_names_total = (
    top_regions.sort_values("Total_Drinks", ascending=False).head(11).index
)
# errors="ignore": do not fail if Saint Petersburg is not among the results.
list(city_names_total.drop("Saint Petersburg", errors="ignore"))

# Same selection ranked by wine consumption.
city_names_wine = top_regions.sort_values("wine", ascending=False).head(11).index
list(city_names_wine.drop("Saint Petersburg", errors="ignore"))
# Notebook formatting step (shell command and its captured output, not Python):
# !nbqa black notebook.ipynb
# reformatted notebook.ipynb
# All done! ✨ 🍰 ✨
# 1 file reformatted.