import pandas as pd
import seaborn as sns
import numpy as np
import sys
from plotly import express as px
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
# Load the per-invoice sales export and take a first look at it.
# The bare expressions are notebook cell outputs (no effect when run as a script).
df = pd.read_csv("/content/sample_data/ventas-por-factura.csv")
df
df.dtypes
df.describe()
df.info()
df.isnull().sum()

# Country lookup table; later code reads its "Country" and "Alpha-3 code" columns.
df_i_c = pd.read_csv("/content/sample_data/countries_codes_and_coordinates.csv")
df_i_c

# Discard every row that has at least one missing value.
df = df.dropna(axis="rows")

# Parse the whole column in one vectorized call instead of invoking
# pd.to_datetime once per element through apply().
df["Fecha de factura"] = pd.to_datetime(df["Fecha de factura"])
def n_p(x):
    """Convert a value to float, ignoring ',' thousands separators.

    Args:
        x: A numeric string such as "1,234.5" or "12", or a value that is
            already numeric (int/float).

    Returns:
        The value as a float.

    Raises:
        ValueError: If a string cannot be parsed as a number once its commas
            have been removed.
    """
    # Only strings can carry separators; numbers go straight to float() so a
    # column that pandas already parsed as numeric does not raise TypeError.
    if isinstance(x, str):
        x = x.replace(",", "")
    return float(x)
# Convert "Cantidad" to float and blank out non-positive values so the
# following dropna() removes those rows. Each value is parsed once
# (the previous lambda called n_p twice per element).
df["Cantidad"] = df["Cantidad"].map(n_p)
df["Cantidad"] = df["Cantidad"].where(df["Cantidad"] > 0)
df = df.dropna(axis="rows")
df["Cantidad"]

# Same treatment for "Monto": parse, keep only strictly positive amounts.
df["Monto"] = df["Monto"].map(n_p)
df["Monto"] = df["Monto"].where(df["Monto"] > 0)
df = df.dropna(axis="rows")
# Calendar month of each invoice; invoices dated in 2020 are labelled month 0.
# NOTE(review): presumably this places the 2020 invoices before the following
# year's months on the x axis -- confirm against the date range of the data.
df["month"] = df["Fecha de factura"].dt.month.where(
    df["Fecha de factura"].dt.year != 2020, 0
)
df["year"] = df["Fecha de factura"].dt.year

# Average invoice amount per (month, year). Only "Monto" is read afterwards,
# so the aggregation is limited to it; averaging every column fails on text
# columns in pandas versions that no longer drop them silently.
df_t = pd.pivot_table(df, index=["month", "year"], values=["Monto"], aggfunc="mean")
df_t = df_t.reset_index()

sns.set(rc={"figure.figsize": (15, 9)})
sns.barplot(data=df_t, x="month", y="Monto")
df_t["Monto"].mean()
# Invoices from every country except the United Kingdom.
is_not_uk = df["País"] != "United Kingdom"
df_n = df.loc[is_not_uk]
df.dtypes

# Store customer ids as strings so they behave as labels rather than numbers.
df["ID Cliente"] = df["ID Cliente"].astype(str)
df

# Number of non-null entries per column for each month, plotted as a line
# using the "Fecha de factura" count.
df_t = df.pivot_table(index=["month"], aggfunc="count").reset_index()
sns.lineplot(x=df_t["month"], y=df_t["Fecha de factura"])
df_t["ID Cliente"].mean()
# Total amount and total quantity per customer. The aggregation is limited to
# the two columns that are plotted: summing every column also hits the
# datetime column (which cannot be summed) and concatenates text columns.
# The table is built once and re-sorted for each chart instead of being
# recomputed from scratch.
df_c = pd.pivot_table(
    df, index=["ID Cliente"], values=["Cantidad", "Monto"], aggfunc="sum"
)
df_c = df_c.reset_index()

# Top 10 customers by total amount.
df_c = df_c.sort_values(by="Monto", ascending=False)
sns.barplot(x=df_c["ID Cliente"].head(10), y=df_c["Monto"].head(10))

# Top 10 customers by total quantity.
df_c = df_c.sort_values(by="Cantidad", ascending=False)
sns.barplot(x=df_c["ID Cliente"].head(10), y=df_c["Cantidad"].head(10))
df_c
# Per-customer averages of the numeric columns. The columns are listed
# explicitly because averaging every column fails on text columns such as
# "País" in pandas versions that no longer drop them silently.
df_c = pd.pivot_table(
    df,
    index=["ID Cliente"],
    values=["Cantidad", "Monto", "month", "year"],
    aggfunc="mean",
)
df_c = df_c.reset_index()
df_c

# Mean across customers of their average invoice amount.
df_c["Monto"].mean()
# First invoice of every customer: sort by date, keep the earliest row per id.
# .copy() makes df_n an independent frame so adding a column below is safe.
df_n = df.sort_values(by="Fecha de factura")
df_n = (
    df_n[["Fecha de factura", "ID Cliente"]]
    .drop_duplicates(subset=["ID Cliente"])
    .copy()
)
df

# Cohort = month of the customer's first invoice, using the same
# "2020 -> month 0" labelling as the "month" column. It is computed from
# df_n's own dates instead of relying on index alignment with df.
df_n["month_cohort"] = df_n["Fecha de factura"].dt.month.where(
    df_n["Fecha de factura"].dt.year != 2020, 0
)
df_n

# Kept from the original notebook; nothing later in this file reads df_t.
df_t = df.sort_values(by="Fecha de factura")

# Keep only the join key and the cohort. The earlier
# `df_n.pivot_table(df_n, ...)` passed the frame itself as `values`, and a
# surviving date column would be suffixed in the merge, which breaks the
# "Fecha de factura" lookup used for the heatmap.
df_n = df_n[["ID Cliente", "month_cohort"]]
df_c = pd.merge(left=df_n, right=df, on="ID Cliente")
df_c[df_c["ID Cliente"] == "14646.0"]

# Rows: cohort month. Columns: invoice month. Cells: number of invoice rows.
df_c = pd.pivot_table(
    df_c, index=["month_cohort"], columns=["month"], aggfunc="count"
)
df_c["Cantidad"]
sns.heatmap(df_c["Fecha de factura"], annot=True, fmt=".2f")
# Country name -> ISO alpha-3 code lookup, built once. Quotes and surrounding
# whitespace are removed from the codes; the first row wins when a country
# name is repeated. This replaces parsing the printed repr of a filtered
# Series for every invoice row, which depended on repr spacing and produced
# the literal "object)" for countries missing from the lookup table.
country_codes = (
    df_i_c.drop_duplicates(subset="Country")
    .set_index("Country")["Alpha-3 code"]
    .astype(str)
    .str.replace('"', "", regex=False)
    .str.strip()
)

# Countries without a match get NaN and are left out of the pivot below.
df["pais_codigo"] = df["País"].map(country_codes)

# Number of non-null entries per column for each (code, country) pair.
df_c = pd.pivot_table(df, index=["pais_codigo", "País"], aggfunc="count")
df_c = df_c.reset_index()
df_c["Cantidad"]
df
# The chart cannot be viewed on GitHub because it does not render plotly
# figures; a picture is included instead. To see and interact with the chart,
# open the project in Colab:
# https://colab.research.google.com/drive/1FLDDs4Xr_NU6aDfS_FEWYLzclKWCmzBh?usp=sharing

# World map coloured by the per-country "Fecha de factura" count.
fig = px.choropleth(
    data_frame=df_c,
    locations="pais_codigo",
    color="Fecha de factura",
    hover_name="País",
    color_continuous_scale=px.colors.sequential.Plasma,
)
fig.show()
df

# Cohort = month of each customer's FIRST invoice (2020 -> month 0, matching
# the "month" column). The previous expression was identical to the one used
# for "month", so the two features were exact duplicates.
first_invoice = df.groupby("ID Cliente")["Fecha de factura"].transform("min")
df["month_cohort"] = first_invoice.dt.month.where(first_invoice.dt.year != 2020, 0)

# One row per customer holding the mean of each clustering feature. The
# columns are listed explicitly so text columns are never averaged.
df_c = pd.pivot_table(
    df,
    index=["ID Cliente"],
    values=["Cantidad", "Monto", "month", "month_cohort"],
    aggfunc="mean",
)
df_c = df_c.reset_index()

# Standardize the features before PCA / k-means.
X = df_c[["Cantidad", "Monto", "month", "month_cohort"]]
scaler = StandardScaler()
scaler = scaler.fit(X)
X = scaler.transform(X)
X
# Fit a 4-component PCA only to plot the explained-variance ratio of each
# component.
pca = PCA(n_components=4).fit(X)
sns.lineplot(x=range(pca.n_components_), y=pca.explained_variance_ratio_)

# Refit with 2 components and project the data onto them for plotting and
# clustering; the result is wrapped in a DataFrame with columns 0 and 1.
pca = PCA(n_components=2).fit(X)
X = pd.DataFrame(pca.transform(X))
sns.scatterplot(x=X[0], y=X[1])
# Elbow method: within-cluster sum of squares (inertia) for k = 1..10.
# random_state makes the curve reproducible between runs; n_init is spelled
# out because its default differs between scikit-learn versions.
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, n_init=10, random_state=0)
    kmeans = kmeans.fit(X)
    wcss.append(kmeans.inertia_)
sns.lineplot(x=range(1, 11), y=wcss)
# Final segmentation with 3 clusters. Seeded so that the stored labels are
# reproducible between runs; n_init is explicit because its default differs
# between scikit-learn versions.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0)
kmeans = kmeans.fit(X)

# Predict once and reuse the same labels for both the table and the plot.
df_c["group"] = kmeans.predict(X)
df_c
sns.scatterplot(x=X[0], y=X[1], hue=df_c["group"].to_numpy())