import string
from collections import Counter

import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

# Load the IMDB reviews, shuffle them, reduce the review text to its 30 most
# common characters, and encode the sentiment column as numbers.
data = pd.read_csv("IMDB Dataset.csv")
# Shuffle every row; no random_state is passed, so the order varies per run.
data = data.sample(frac=1).reset_index(drop=True)

# Strip "<br />" tags and convert to lowercase
data["review"] = data["review"].apply(lambda x: x.replace("<br />", " ").lower())

# Strip punctuation. str.translate removes each character literally, whereas
# splicing string.punctuation into a regex character class reads its "\]" as
# an escaped "]" and leaves backslashes in the text.
punctuation_table = str.maketrans("", "", string.punctuation)
data["review"] = data["review"].apply(lambda x: x.translate(punctuation_table))

# Get top 30 most common characters, most frequent first. Reviews are joined
# with a space so the counts are taken over the corpus as one long text.
char_counts = Counter(" ".join(data["review"].to_list()))
chars = "".join(char for char, _ in char_counts.most_common(30))

# Remove everything except the top 30 most common characters. Every character
# in the corpus is a key of char_counts, so mapping the non-kept ones to None
# drops them without building a regex out of arbitrary characters.
kept_chars = set(chars)
rare_char_table = {
    ord(char): None for char in char_counts if char not in kept_chars
}
data["review"] = data["review"].apply(lambda x: x.translate(rare_char_table))

# Replace the sentiment values with ordinal codes.
data[["sentiment"]] = OrdinalEncoder().fit_transform(data[["sentiment"]])
print(data.head())
Run the app to see the outputs.
Press the Run button in the top-right corner.