%%bash
if test -f requirements.txt
then
sed -i '/jedi/d;/jupyter/d;' ./requirements.txt
pip install -r ./requirements.txt
else echo "All done my fammm"
fi
import pandas as pd
import tiktoken
from openai.embeddings_utils import get_embedding
import openai
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"
max_tokens = 8000
df_1 = pd.read_csv("reviews_1k.csv")
df = df_1[["Time", "ProductId", "UserId", "Score", "Summary", "Text"]]
df = df.dropna()
df["combined"] = (
"Title: " + df.Summary.str.strip() + "; Content: " + df.Text.str.strip()
)
df.head(2)
top_n = 1000
df = df.sort_values("Time").tail(top_n * 2)
df.drop("Time", axis=1, inplace=True)
encoding = tiktoken.get_encoding(embedding_encoding)
df["n_tokens"] = df.combined.apply(lambda x: len(encoding.encode(x)))
df = df[df.n_tokens <= max_tokens].tail(top_n)
len(df)
import os
openai.api_key = os.environ.get("OPENAI_KEY")
##df["embedding"] = df.combined.apply(lambda x: get_embedding(x, engine=embedding_model))
##df.to_csv('embeddings_1k.csv')
import numpy as np
import pandas as pd
df_embedding = pd.read_csv("embeddings_1k.csv")
df_embedding["embedding"] = df_embedding.embedding.apply(eval).apply(np.array) # convert string to numpy array
matrix = np.vstack(df.embedding.values)
matrix.shape
from sklearn.cluster import KMeans
n_clusters = 4
kmeans = KMeans(n_clusters=n_clusters, init="k-means++", random_state=42)
kmeans.fit(matrix)
labels = kmeans.labels_
df_embedding["Cluster"] = labels
df_embedding.groupby("Cluster").Score.mean().sort_values()
from sklearn.manifold import TSNE
import matplotlib
import matplotlib.pyplot as plt
tsne = TSNE(n_components=2, perplexity=15, random_state=42, init="random", learning_rate=200)
vis_dims2 = tsne.fit_transform(matrix)
x = [x for x, y in vis_dims2]
y = [y for x, y in vis_dims2]
for category, color in enumerate(["purple", "green", "red", "blue"]):
xs = np.array(x)[df_embedding.Cluster == category]
ys = np.array(y)[df_embedding.Cluster == category]
plt.scatter(xs, ys, color=color, alpha=0.3)
avg_x = xs.mean()
avg_y = ys.mean()
plt.scatter(avg_x, avg_y, marker="x", color=color, s=100)
plt.title("Clusters identified visualized in language 2d using t-SNE")
import openai
rev_per_cluster = 5
for i in range(n_clusters):
print(f"Cluster {i} Theme:", end=" ")
reviews = "\n".join(
df_embedding[df_embedding.Cluster == i]
.combined.str.replace("Title: ", "")
.str.replace("\n\nContent: ", ": ")
.sample(rev_per_cluster, random_state=42)
.values
)
response = openai.Completion.create(
engine="text-davinci-003",
prompt=f'¿Que tipo de informacion nos sirve para ayudar a mejorar nuestro producto?\n\nCustomer reviews:\n"""\n{reviews}\n"""\n\nTheme:',
temperature=0,
max_tokens=64,
top_p=1,
frequency_penalty=0,
presence_penalty=0,
)
print(response["choices"][0]["text"].replace("\n", ""))
sample_cluster_rows = df_embedding[df_embedding.Cluster == i].sample(rev_per_cluster, random_state=42)
for j in range(rev_per_cluster):
print(sample_cluster_rows.Score.values[j], end=", ")
print(sample_cluster_rows.Summary.values[j], end=": ")
print(sample_cluster_rows.Text.str[:70].values[j])
print("-" * 100)