Alcoholic Drinks Promotion in Russia

# Standard library
import os
import warnings

# Third-party: numerics, plotting, clustering
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from scipy.cluster import hierarchy
from scipy.spatial import distance_matrix
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder

# Suppress every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# Touch the data directory; raises immediately if "data/" does not exist.
os.listdir("data/")

# Seed NumPy's global random number generator.
np.random.seed(2021)

# Read the consumption table and preview its first rows.
df = pd.read_csv("./data/russian_alcohol_consumption.csv")
df.head()

# Summary statistics of the table.
df.describe()

# Distinct values of the "region" column as a plain Python list.
list(pd.unique(df["region"]))

# Correlation heatmap of the numeric columns.
#
# The frame also holds the string column "region". On pandas >= 2.0
# DataFrame.corr() no longer drops non-numeric columns silently and raises
# instead, so the numeric columns are selected explicitly first. On older
# pandas this yields the same matrix the bare df.corr() call produced.
numeric_corr = df.select_dtypes(include="number").corr()
dataplot = sns.heatmap(numeric_corr, cmap="YlGnBu", annot=True)
plt.show()

# Missing-value report: one row per column of df, with the absolute number of
# NaNs and the same figure as a percentage of the row count.
missing_counts = df.isna().sum()
missing_share = missing_counts / df.shape[0] * 100
percent_labels = [f"{share:.2f}%" for share in missing_share.tolist()]

# Built row-wise and transposed so that each column of df becomes a row.
NA = pd.DataFrame(
    data=[missing_counts.tolist(), percent_labels],
    columns=df.columns,
    index=["NA Count", "NA Percent"],
).T
NA = NA.sort_values(by="NA Count", ascending=False)

# Colour the count column so the worst offenders stand out.
NA.style.background_gradient(cmap="seismic", subset=["NA Count"])

# Forward-fill missing values in place (each NaN takes the last valid value
# above it in the same column).
#
# DataFrame.ffill() replaces fillna(method="pad"), whose `method` argument is
# deprecated in pandas 2.1+; the result is identical.
# NOTE(review): the fill follows row order, so a gap can be filled from a row
# belonging to a different region — confirm this is intended given how the CSV
# is sorted.
df.ffill(inplace=True)

# Remaining NaN count per column (non-zero only for leading gaps).
df.isna().sum()

# Attach coordinates to every consumption row, then collapse to one row per
# region holding the mean of each numeric column, ordered by beer (highest
# first). The merge is an inner join, so regions missing from the geo file are
# dropped.
geo = pd.read_csv("data/russian_geo.csv")
df_geo = (
    pd.merge(df, geo, on="region")
    .groupby("region")
    # numeric_only=True keeps the pre-2.0 pandas behaviour of skipping
    # non-numeric columns; without it pandas >= 2.0 raises TypeError as soon
    # as the merged frame contains any string column besides the group key.
    .mean(numeric_only=True)
    .reset_index()
    .sort_values("beer", ascending=False)
)

# Bubble map of mean alcohol consumption per region.
#
# df_geo is ordered by beer consumption (descending) when it is built, so row
# positions are ranks and the bands below are rank bands.
df_geo["total"] = (
    df_geo["wine"]
    + df_geo["brandy"]
    + df_geo["vodka"]
    + df_geo["beer"]
    + df_geo["champagne"]
)
df_geo["text"] = (
    df_geo["region"] + "<br> Alcohol consumption:" + (df_geo["total"]).astype(str)
)

# Half-open positional ranges [start, stop). The previous bounds
# (0, 10), (11, 21), (22, 84) left gaps: slices exclude their end position, so
# the regions ranked 11th and 22nd were never drawn. Adjacent bands now share
# their boundary so every rank up to 84 is covered exactly once.
# NOTE(review): the last band stops at rank 84 — confirm that matches the
# number of regions left after the geo merge.
limits = [(0, 10), (10, 21), (21, 84)]
colors = ["royalblue", "crimson", "lightseagreen"]

fig = go.Figure()
for (start, stop), color in zip(limits, colors):
    df_sub = df_geo.iloc[start:stop]
    fig.add_trace(
        go.Scattergeo(
            lon=df_sub["lon"],
            lat=df_sub["lat"],
            text=df_sub["text"],
            marker=dict(
                size=df_sub["total"],
                color=color,
                line_color="rgb(40,40,40)",
                line_width=0.5,
                sizemode="area",
            ),
            # Human-readable 1-based inclusive rank range.
            name="Top {0} - {1}".format(start + 1, stop),
        )
    )

fig.update_layout(
    title_text="Alcohol Consumption in Russia",
    showlegend=True,
    legend_title="Beer 1 liter/percapita",
    legend_title_font_size=14,
    geo=dict(
        scope="world",
        landcolor="rgb(217, 217, 217)",
        lonaxis=dict(range=[23.6985, 80.06269]),
        lataxis=dict(range=[33.72197, 80.7293]),
    ),
)
fig.show()

# One line chart per drink column, side by side on a shared y axis.
# The drink columns are taken by position: df.columns[2] through df.columns[6].
fig, axes = plt.subplots(1, 5, figsize=(15, 6), sharey=True)
for offset, axis in enumerate(axes):
    column_name = df.columns[offset + 2]
    sns.lineplot(data=df, x="year", y=str(column_name), ax=axis)
    axis.set_title(f"{column_name} consumption")
    axis.set_xlabel("Year")
    axis.set_ylabel("litres per capita")

# Long-format table of the yearly mean of every numeric column:
# one row per (year, type) with the mean in "consumption".
year_df = (
    df.groupby("year")
    # The string column "region" cannot be averaged; pandas >= 2.0 raises
    # TypeError unless non-numeric columns are excluded explicitly.
    .mean(numeric_only=True)
    .stack(0)
    .reset_index()
    .rename(columns={"level_1": "type", 0: "consumption"})
)

# Animated bubble chart: one frame per year, one bubble per drink type.
fig = px.scatter(
    year_df,
    x="type",
    y="consumption",
    animation_frame="year",
    color="type",
    size="consumption",
    size_max=55,
    range_y=[0, 60],
)
fig.show()

# Feature matrix for clustering: mean consumption of each drink per region.
# The columns are selected with a list; selecting several GroupBy columns with
# a bare tuple (["wine", "beer", ...] without the inner brackets) was removed
# in pandas 2.0 and raises there.
X = df.groupby("region")[["wine", "beer", "vodka", "champagne", "brandy"]].mean()

# Elbow search: inertia of KMeans for k = 1..9. No random_state is passed, so
# the fits draw from NumPy's global random state.
clusters = [KMeans(n_clusters=k).fit(X).inertia_ for k in range(1, 10)]

fig, ax = plt.subplots(figsize=(12, 8))
sns.lineplot(x=list(range(1, 10)), y=clusters, ax=ax)
ax.set_title("Searching for Elbow")
ax.set_xlabel("Clusters")
ax.set_ylabel("Inertia");

# KMeans with 2 clusters on the five drink features.
# The feature columns are named explicitly so that re-running this cell after
# "Labels" has been added to X does not feed old labels back into the model.
drink_columns = ["wine", "beer", "vodka", "champagne", "brandy"]
km = KMeans(n_clusters=2).fit(X[drink_columns])
X["Labels"] = km.predict(X[drink_columns])

plt.figure(figsize=(12, 8))
# x/y are passed by keyword: seaborn >= 0.12 rejects positional data arguments.
sns.scatterplot(data=X, x="wine", y="beer", hue="Labels", palette="Set1", alpha=0.7)
plt.title("KMeans with 2 Clusters")
plt.show()

# KMeans with 3 clusters on the five drink features.
#
# X already carries a "Labels" column from the earlier 2-cluster run; fitting
# on the whole frame would use those labels as a sixth feature. Only the drink
# columns are passed to the model.
# NOTE(review): cluster ids are arbitrary and can change with the input or the
# library version — re-check any hard-coded id -> name mapping after re-fitting.
drink_columns = ["wine", "beer", "vodka", "champagne", "brandy"]
km = KMeans(n_clusters=3).fit(X[drink_columns])
X["Labels"] = km.predict(X[drink_columns])

plt.figure(figsize=(12, 8))
# x/y are passed by keyword: seaborn >= 0.12 rejects positional data arguments.
sns.scatterplot(data=X, x="wine", y="beer", hue="Labels", palette="Set1", alpha=0.7)
plt.title("KMeans with 3 Clusters")
plt.show()

# Four pairwise scatter plots of the drink features, coloured by KMeans label.
# Each entry is (x column, y column, panel title).
scatter_panels = [
    ("wine", "beer", "beer V/S wine"),
    ("vodka", "champagne", "vodka V/S champagne"),
    ("wine", "champagne", "wine V/S champagne"),
    ("vodka", "brandy", "vodka V/S brandy"),
]

fig, axes = plt.subplots(1, 4, figsize=(15, 6))
for axis, (x_column, y_column, panel_title) in zip(axes, scatter_panels):
    # Keyword x/y: seaborn >= 0.12 no longer accepts them positionally.
    sns.scatterplot(
        data=X,
        x=x_column,
        y=y_column,
        hue="Labels",
        palette="Set1",
        alpha=0.7,
        ax=axis,
    )
    axis.set_title(panel_title)
plt.show()

# Swarm plot of every drink against the cluster label, five panels in one row.
fig = plt.figure(figsize=(10, 8))
for position, drink in enumerate(
    ["beer", "vodka", "wine", "champagne", "brandy"], start=1
):
    panel = fig.add_subplot(1, 5, position)
    sns.swarmplot(x="Labels", y=drink, data=X, ax=panel)
    panel.set_title(drink)
plt.show()

# Combined consumption per region: the five drink columns added up in the
# order beer, wine, champagne, vodka, brandy.
com_X = (
    X["beer"]
    .add(X["wine"])
    .add(X["champagne"])
    .add(X["vodka"])
    .add(X["brandy"])
)

# Distribution of the combined figure within each cluster label.
sns.swarmplot(x="Labels", y=com_X, data=X)
plt.title("Labels V/S All drinks");

# Average-linkage hierarchy over the regions, using Euclidean distance on the
# five drink features.
#
# Two fixes relative to linkage(distance_matrix(X, X), "average"):
#   * hierarchy.linkage interprets a 2-D input as an observation matrix, not
#     as distances, so a square distance matrix was being clustered as if each
#     row of distances were a feature vector. The raw features are passed
#     instead and linkage computes the pairwise distances itself.
#   * X also holds the "Labels" column produced by KMeans; only the drink
#     columns are used so earlier cluster ids do not act as a feature.
drink_columns = ["wine", "beer", "vodka", "champagne", "brandy"]
drink_features = X[drink_columns].to_numpy()

# Square pairwise distance matrix, kept under its original name.
dist = distance_matrix(drink_features, drink_features)
Z = hierarchy.linkage(drink_features, method="average", metric="euclidean")

# Draw the linkage tree sideways with small, unrotated leaf labels.
dendrogram_options = dict(orientation="right", leaf_rotation=0, leaf_font_size=6)
plt.figure(figsize=(12, 8))
dendro = hierarchy.dendrogram(Z, **dendrogram_options)

# Ward agglomerative clustering of the regions into 4 groups.
#
# * The `affinity` keyword is deprecated in recent scikit-learn releases and
#   later removed. Ward linkage only supports Euclidean distance, which is
#   also the default, so the argument is simply omitted; this works on old and
#   new versions alike.
# * Only the drink columns are clustered; X also contains the KMeans "Labels"
#   column, which must not be treated as a feature.
# NOTE(review): cluster ids are arbitrary — re-check any hard-coded
# id -> name mapping that consumes y_hc.
drink_columns = ["wine", "beer", "vodka", "champagne", "brandy"]
hc = AgglomerativeClustering(n_clusters=4, linkage="ward")
y_hc = hc.fit_predict(X[drink_columns])

# Wine vs. beer, coloured by the agglomerative cluster id of each region.
plt.figure(figsize=(12, 8))
# x/y are passed by keyword: seaborn >= 0.12 rejects positional data arguments.
sns.scatterplot(x=X["wine"], y=X["beer"], hue=y_hc, palette="Set1", alpha=0.7)
plt.title("Agglomerative Clustering with 4 Clusters")
plt.show()

# Name the agglomerative clusters.
# NOTE(review): the id -> name table is hard-coded; cluster ids are assigned
# by the fit, so confirm the table still matches after any re-fit.
hierarchical_names = {2: "Low", 1: "Medium", 3: "High", 0: "Top"}
X["H_Pop"] = y_hc
X["H_Pop"] = X["H_Pop"].replace(hierarchical_names)

# Peek at the first two regions of the "Top" group.
X.loc[X["H_Pop"] == "Top"].head(2)

# Name the KMeans clusters and attach the combined consumption figure.
# NOTE(review): the id -> name table is hard-coded; KMeans ids are assigned by
# the fit, so confirm the table still matches after any re-fit.
kmeans_names = {2: "Low", 0: "Medium", 1: "High"}
X.rename(columns={"Labels": "KM_Pop"}, inplace=True)
X["Total_Drinks"] = com_X
X["KM_Pop"] = X["KM_Pop"].replace(kmeans_names)

# The 11 "High" regions with the largest combined consumption.
high_kmeans = X.loc[X["KM_Pop"] == "High"]
high_kmeans.sort_values("Total_Drinks", ascending=False).head(11)

# Regions that are "High" under KMeans and "Top" under the hierarchy, ordered
# by combined consumption (largest first).
ranked_by_total = X[(X["KM_Pop"] == "High") & (X["H_Pop"] == "Top")].sort_values(
    "Total_Drinks", ascending=False
)
city_names_total = ranked_by_total.head(11).index

# Up to ten regions other than Saint Petersburg.
# Index.drop() raises KeyError when the label is absent, which happened
# whenever Saint Petersburg was not among the selected rows. Dropping with
# errors="ignore" before truncating returns the same list as before when it is
# present and still returns ten regions when it is not.
list(ranked_by_total.index.drop("Saint Petersburg", errors="ignore")[:10])

# Regions that are "High" under KMeans and "Top" under the hierarchy, ordered
# by wine consumption (largest first).
ranked_by_wine = X[(X["KM_Pop"] == "High") & (X["H_Pop"] == "Top")].sort_values(
    "wine", ascending=False
)
city_names_wine = ranked_by_wine.head(11).index

# Up to ten regions other than Saint Petersburg.
# Index.drop() raises KeyError when the label is absent, which happened
# whenever Saint Petersburg was not among the selected rows. Dropping with
# errors="ignore" before truncating returns the same list as before when it is
# present and still returns ten regions when it is not.
list(ranked_by_wine.index.drop("Saint Petersburg", errors="ignore")[:10])

# IPython shell escape (only valid inside a notebook/IPython session):
# runs the black formatter over notebook.ipynb through nbqa.
!nbqa black notebook.ipynb