# E-Commerce

import sys

import numpy as np
import pandas as pd
import seaborn as sns
from plotly import express as px
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Load the per-invoice sales export (paths follow the Colab sample_data layout).
df = pd.read_csv("/content/sample_data/ventas-por-factura.csv")

# Notebook-style inspection of the raw frame: column dtypes, summary
# statistics, the info() overview and the number of missing values per column.
df.dtypes
df.describe()
df.info()
df.isnull().sum()

# Country lookup table; its "Country" and "Alpha-3 code" columns are used
# further down to attach a country code to every invoice.
df_i_c = pd.read_csv("/content/sample_data/countries_codes_and_coordinates.csv")
df_i_c

# Discard every row that has at least one missing value.
df = df.dropna(axis=0)

# Parse each invoice date into a pandas timestamp, one value at a time.
df["Fecha de factura"] = df["Fecha de factura"].apply(pd.to_datetime)

def n_p(x):
    """Parse a number that may use commas as thousands separators.

    Args:
        x: A string such as ``"1,234.5"`` or ``"42"``, or a value that is
            already numeric (int/float), which is passed straight to
            ``float``.

    Returns:
        The value as a ``float``.

    Raises:
        ValueError: If the text left after removing commas is not a valid
            float literal.
    """
    # Only strings can carry separators; numeric input previously crashed on
    # the `"," in x` membership test.
    if isinstance(x, str):
        x = x.replace(",", "")
    return float(x)

# Convert "Cantidad" to float and blank out non-positive values so the
# following dropna removes those invoices. Each value is parsed once; the
# positivity check is a vectorised mask instead of a second n_p call per row.
df["Cantidad"] = df["Cantidad"].apply(n_p)
df["Cantidad"] = df["Cantidad"].where(df["Cantidad"] > 0)

df = df.dropna(axis="rows")

df["Cantidad"]

# Same treatment for "Monto": parse, keep only strictly positive amounts.
df["Monto"] = df["Monto"].apply(n_p)
df["Monto"] = df["Monto"].where(df["Monto"] > 0)

df = df.dropna(axis="rows")

# Calendar features derived from the invoice date. Invoices dated in 2020 are
# mapped to month 0; every other invoice keeps its calendar month.
# NOTE(review): presumably this makes the 2020 invoices sort before January of
# the following year -- confirm against the date range of the data.
df["month"] = df["Fecha de factura"].dt.month.where(
    df["Fecha de factura"].dt.year != 2020, 0
)
df["year"] = df["Fecha de factura"].dt.year

# Mean invoice amount per (month, year). `values` is given explicitly because
# averaging every column fails on pandas >= 2.0 when the frame also holds
# text or datetime columns.
df_t = pd.pivot_table(df, index=["month", "year"], values=["Monto"], aggfunc="mean")
df_t = df_t.reset_index()

sns.set(rc={"figure.figsize": (15, 9)})
sns.barplot(data=df_t, x="month", y="Monto")

# Overall mean of the per-period mean amounts.
df_t["Monto"].mean()

# Invoices from every country other than the United Kingdom.
df_n = df.loc[df["País"] != "United Kingdom"]

df.dtypes

# Store customer IDs as text so they are treated as labels, not numbers.
df["ID Cliente"] = df["ID Cliente"].astype(str)

# Per month, count the non-null entries of every remaining column.
df_t = df.pivot_table(index=["month"], aggfunc="count").reset_index()

# Number of invoice dates (i.e. invoices) recorded in each month.
sns.lineplot(data=df_t, x="month", y="Fecha de factura")

# Average of the monthly "ID Cliente" counts.
df_t["ID Cliente"].mean()

# Total quantity and amount per customer. Only the two numeric measures are
# aggregated: summing every column fails on pandas >= 2.0 for datetime columns
# and would concatenate text columns.
df_c = pd.pivot_table(
    df, index=["ID Cliente"], values=["Cantidad", "Monto"], aggfunc="sum"
)
df_c = df_c.reset_index()

# Top 10 customers by total amount.
df_c = df_c.sort_values(by="Monto", ascending=False)

sns.barplot(x=df_c["ID Cliente"].head(10), y=df_c["Monto"].head(10))

# Top 10 customers by total quantity. The totals table is simply re-sorted;
# the original rebuilt the identical pivot table first.
df_c = df_c.sort_values(by="Cantidad", ascending=False)

sns.barplot(x=df_c["ID Cliente"].head(10), y=df_c["Cantidad"].head(10))

df_c

# Per-customer averages of the numeric invoice columns.
df_c = pd.pivot_table(
    df,
    index=["ID Cliente"],
    values=["Cantidad", "Monto", "month", "year"],
    aggfunc="mean",
)
df_c = df_c.reset_index()

df_c

# Mean over customers of their average invoice amount.
df_c["Monto"].mean()

# Cohort analysis: group customers by the month of their first invoice and
# count their invoices in every month.

# Invoices in chronological order.
df_t = df.sort_values(by="Fecha de factura")

# First invoice of every customer: with rows sorted by date, the first
# occurrence of each "ID Cliente" is that customer's earliest purchase.
df_n = (
    df_t[["Fecha de factura", "ID Cliente"]]
    .drop_duplicates(subset=["ID Cliente"])
    .copy()
)

# Cohort month uses the same encoding as the "month" column: invoices dated
# in 2020 map to 0, all others keep their calendar month.
df_n["month_cohort"] = df_n["Fecha de factura"].dt.month.where(
    df_n["Fecha de factura"].dt.year != 2020, 0
)

df_n

# Keep only the join key and the cohort. Carrying "Fecha de factura" into the
# merge would rename it to "Fecha de factura_x"/"_y" and break the heatmap
# lookup below. (The original obtained this table through
# `df_n.pivot_table(df_n, ...)`, which passes the frame as `values`.)
df_n = df_n[["ID Cliente", "month_cohort"]]

# Attach each customer's cohort to all of their invoices. A pre-existing
# "month_cohort" column on df (e.g. after re-running later cells) is dropped
# so the merge does not produce suffixed duplicates.
df_c = pd.merge(
    left=df_n,
    right=df.drop(columns="month_cohort", errors="ignore"),
    on="ID Cliente",
)

# Spot-check a single customer.
df_c[df_c["ID Cliente"] == "14646.0"]

# Rows: cohort month; columns: invoice month; cells: non-null counts per column.
df_c = pd.pivot_table(df_c, index=["month_cohort"], columns=["month"], aggfunc="count")

df_c["Cantidad"]

# Invoice counts per (cohort, month).
sns.heatmap(df_c["Fecha de factura"], annot=True, fmt=".2f")

# Country name -> ISO alpha-3 code lookup, built once from the country table.
# The codes are stripped of double quotes and surrounding whitespace, which is
# what the original extracted by splitting the printed repr of a filtered
# Series for every single invoice row. For duplicated country names the first
# entry wins, as before.
alpha3_by_country = (
    df_i_c.drop_duplicates(subset="Country")
    .set_index("Country")["Alpha-3 code"]
    .str.replace('"', "", regex=False)
    .str.strip()
)

# Countries missing from the lookup get NaN instead of a garbage token and are
# therefore left out of the grouped table below.
df["pais_codigo"] = df["País"].map(alpha3_by_country)

# Non-null counts of every column per country.
df_c = pd.pivot_table(df, index=["pais_codigo", "País"], aggfunc="count")
df_c = df_c.reset_index()

df_c["Cantidad"]

# The chart cannot be viewed on GitHub because it does not render plotly
# figures, so a screenshot is provided instead. To view and interact with the
# chart, open the project in Colab:
# https://colab.research.google.com/drive/1FLDDs4Xr_NU6aDfS_FEWYLzclKWCmzBh?usp=sharing
fig = px.choropleth(
    df_c,
    locations="pais_codigo",
    color="Fecha de factura",
    hover_name="País",
    color_continuous_scale=px.colors.sequential.Plasma,
)
fig.show()

# Customer segmentation: PCA on per-customer averages followed by KMeans.

# Cohort month of every invoice = the "month" value of that customer's
# earliest invoice. The original recomputed the month of each invoice itself,
# which made "month_cohort" an exact copy of "month" (a duplicated feature).
df["month_cohort"] = (
    df.sort_values(by="Fecha de factura")
    .groupby("ID Cliente")["month"]
    .transform("first")
)

# One row per customer with the mean of each feature. `values` is explicit
# because averaging text/datetime columns fails on pandas >= 2.0.
df_c = pd.pivot_table(
    df,
    index=["ID Cliente"],
    values=["Cantidad", "Monto", "month", "month_cohort"],
    aggfunc="mean",
)
df_c = df_c.reset_index()

X = df_c[["Cantidad", "Monto", "month", "month_cohort"]]

# Standardise the features before PCA.
scaler = StandardScaler()
scaler = scaler.fit(X)

X = scaler.transform(X)

# Full PCA, used only to inspect the explained variance per component.
pca = PCA(n_components=4)
pca = pca.fit(X)

sns.lineplot(x=range(pca.n_components_), y=pca.explained_variance_ratio_)

# Project onto the first two components for plotting and clustering.
pca = PCA(n_components=2)
pca = pca.fit(X)

X = pca.transform(X)

X = pd.DataFrame(X)

sns.scatterplot(x=X[0], y=X[1])

# Elbow curve: within-cluster sum of squares for k = 1..10. A fixed seed keeps
# the curve and the final labels reproducible between runs.
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, random_state=0)
    kmeans = kmeans.fit(X)
    wcss.append(kmeans.inertia_)

sns.lineplot(x=range(1, 11), y=wcss)

# Final model with 3 clusters.
kmeans = KMeans(n_clusters=3, random_state=0)
kmeans = kmeans.fit(X)

# Predict once and reuse the labels for both the table and the plot.
labels = kmeans.predict(X)
df_c["group"] = labels

df_c

sns.scatterplot(x=X[0], y=X[1], hue=labels)