%%bash
# Install the notebook's dependencies when a requirements file is present.
if [ -f requirements.txt ]; then
  # Drop every line mentioning jedi or jupyter before installing.
  sed -i -e '/jedi/d' -e '/jupyter/d' ./requirements.txt
  pip install -r ./requirements.txt
else
  # No requirements file in the working directory: nothing to install.
  echo "All done my fammm"
fi
import pandas as pd
import tiktoken
from openai.embeddings_utils import get_embedding
import openai
# Embedding configuration: model name, the tiktoken encoding used to count
# tokens, and the per-review token ceiling applied when filtering rows.
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"
max_tokens = 8000

# Load the raw reviews, keep only the columns used by this notebook, and
# discard rows with any missing value among them.
df_1 = pd.read_csv("reviews_1k.csv")
review_columns = ["Time", "ProductId", "UserId", "Score", "Summary", "Text"]
df = df_1[review_columns].dropna()

# Merge the whitespace-trimmed summary and body into the single text field
# that gets tokenized (and embedded).
trimmed_summary = df["Summary"].str.strip()
trimmed_text = df["Text"].str.strip()
df["combined"] = "Title: " + trimmed_summary + "; Content: " + trimmed_text
df.head(2)
top_n = 1000

# Start from the 2 * top_n most recent reviews so that, after over-long
# reviews are removed, up to top_n rows remain.
df = df.sort_values("Time").tail(top_n * 2)
df.drop(columns="Time", inplace=True)

encoding = tiktoken.get_encoding(embedding_encoding)


def _count_tokens(text):
    """Return the number of tokens `encoding` produces for `text`."""
    return len(encoding.encode(text))


df["n_tokens"] = df["combined"].apply(_count_tokens)

# Discard reviews above the token ceiling, then keep the last top_n rows.
within_limit = df["n_tokens"] <= max_tokens
df = df[within_limit].tail(top_n)
len(df)
import os
# Take the API key from the OPENAI_KEY environment variable; the key is None
# when that variable is not set.
openai.api_key = os.getenv("OPENAI_KEY")
# Disabled embedding generation: the first line requests an embedding for every
# row of df and the second writes the result to embeddings_1k.csv.
# NOTE(review): presumably run once to produce the CSV that is read back
# further down -- confirm before re-enabling.
##df["embedding"] = df.combined.apply(lambda x: get_embedding(x, engine=embedding_model))
##df.to_csv('embeddings_1k.csv')
import numpy as np
import pandas as pd
# Imported here so this cell stays self-contained.
from ast import literal_eval

# Load the previously saved reviews together with their embeddings.
df_embedding = pd.read_csv("embeddings_1k.csv")

# The CSV stores each embedding as text. Parse it with literal_eval, which only
# accepts Python literals, instead of eval, so that the file's contents can
# never be executed as code.
# NOTE(review): assumes each cell is the string form of a list of numbers --
# confirm against how embeddings_1k.csv was written.
df_embedding["embedding"] = df_embedding.embedding.apply(literal_eval).apply(np.array)

# Stack the per-row vectors into one 2-D array (one row per review). The
# vectors live in df_embedding; `df` has no "embedding" column because the
# assignment that would create it is commented out.
matrix = np.vstack(df_embedding.embedding.values)
matrix.shape
from sklearn.cluster import KMeans
n_clusters = 4

# Cluster the embedding vectors with k-means, using a fixed random_state.
kmeans = KMeans(n_clusters=n_clusters, init="k-means++", random_state=42)
labels = kmeans.fit(matrix).labels_

# Attach each review's cluster id, then show the mean score per cluster in
# ascending order.
df_embedding["Cluster"] = labels
df_embedding.groupby("Cluster")["Score"].mean().sort_values()
from sklearn.manifold import TSNE
import matplotlib
import matplotlib.pyplot as plt
# Reduce the embedding matrix to two dimensions for plotting.
tsne = TSNE(n_components=2, perplexity=15, random_state=42, init="random", learning_rate=200)
vis_dims2 = tsne.fit_transform(matrix)

# Split the 2-D points into separate coordinate lists.
x = [point[0] for point in vis_dims2]
y = [point[1] for point in vis_dims2]
x_coords = np.array(x)
y_coords = np.array(y)

# One colour per cluster id, in order.
cluster_colors = ["purple", "green", "red", "blue"]
for category, color in enumerate(cluster_colors):
    in_cluster = df_embedding.Cluster == category
    xs = x_coords[in_cluster]
    ys = y_coords[in_cluster]
    # Faint dots for the cluster members ...
    plt.scatter(xs, ys, color=color, alpha=0.3)
    # ... and a larger "x" at the mean position of the cluster.
    plt.scatter(xs.mean(), ys.mean(), marker="x", color=color, s=100)
plt.title("Clusters identified visualized in language 2d using t-SNE")
import openai
# Number of reviews drawn from each cluster, both for the prompt and for the
# printed examples.
rev_per_cluster = 5

for i in range(n_clusters):
    print(f"Cluster {i} Theme:", end=" ")

    # Draw the sample once and reuse it for the prompt and the printout. The
    # size is capped at the cluster size so a small cluster does not make
    # sample() raise.
    cluster_rows = df_embedding[df_embedding.Cluster == i]
    sample_cluster_rows = cluster_rows.sample(
        min(rev_per_cluster, len(cluster_rows)), random_state=42
    )

    # Turn "Title: <summary>; Content: <text>" into "<summary>:  <text>".
    # The combined column is built with "; Content: "; the older
    # "\n\nContent: " separator is still handled in case the CSV was produced
    # with that format. regex=False makes every pattern a plain substring.
    reviews = "\n".join(
        sample_cluster_rows.combined.str.replace("Title: ", "", regex=False)
        .str.replace("\n\nContent: ", ":  ", regex=False)
        .str.replace("; Content: ", ":  ", regex=False)
        .values
    )

    # The prompt question is Spanish for "What kind of information helps us
    # improve our product?". temperature=0 is passed with the request.
    response = openai.Completion.create(
        engine="text-davinci-003",
        prompt=f'¿Que tipo de informacion nos sirve para ayudar a mejorar nuestro producto?\n\nCustomer reviews:\n"""\n{reviews}\n"""\n\nTheme:',
        temperature=0,
        max_tokens=64,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    print(response["choices"][0]["text"].replace("\n", ""))

    # Print score, summary and the first 70 characters of each sampled review.
    for score, summary, text in zip(
        sample_cluster_rows.Score.values,
        sample_cluster_rows.Summary.values,
        sample_cluster_rows.Text.str[:70].values,
    ):
        print(score, end=", ")
        print(summary, end=":   ")
        print(text)

    print("-" * 100)