import anvil.server

# Connect this script to the Anvil app as an Uplink server, so the
# @anvil.server.callable functions below can be invoked from the client.
anvil.server.connect("6UM23VOV3QVURHBZIP2T34RY-7ONIKWBQIBLJ6MZA")
# Console output on a successful connection:
#   Connecting to wss://anvil.works/uplink
#   Anvil websocket open
#   Connected to "Default environment (dev)" as SERVER
import requests
import anvil.media
@anvil.server.callable
def get_reviews(appid, params=None):
    # Fetch one page of reviews from the Steam storefront API.
    if params is None:
        params = {'json': 1}  # avoid a shared mutable default argument
    url = 'https://store.steampowered.com/appreviews/'
    response = requests.get(url=url + appid, params=params,
                            headers={'User-Agent': 'Mozilla/5.0'})
    return response.json()
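# For reference, a single call returns Steam's raw JSON envelope. A minimal
# sketch, using the appid from the test calls at the bottom of this file
# (the field names are Steam's; exact contents vary by query):
#   data = get_reviews('287700')
#   data['success']        # 1 on success
#   data['query_summary']  # aggregate counts and review score
#   data['reviews']        # list of review objects for this page
#   data['cursor']         # opaque cursor pointing at the next page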
import pandas as pd

df_selected_column = pd.DataFrame()  # module-level frame shared by the callables below
@anvil.server.callable
def get_n_reviews(appid, n):
    # Page through the Steam review feed, 100 reviews per request,
    # until n reviews are collected or the feed is exhausted.
    global df_selected_column
    appid = str(appid)
    reviews = []
    cursor = '*'  # Steam's pagination cursor; '*' requests the first page
    params = {
        'json': 1,
        'filter': 'all',
        'language': 'english',
        'day_range': 9223372036854775807,  # effectively "all time"
        'review_type': 'all',
        'purchase_type': 'all'
    }
    while n > 0:
        params['cursor'] = cursor.encode()  # cursors can contain characters that need URL-encoding
        params['num_per_page'] = min(100, n)
        n -= 100
        response = get_reviews(appid, params)
        cursor = response['cursor']
        reviews += response['reviews']
        if len(response['reviews']) < 100:
            break
    df_all_column = pd.json_normalize(reviews, sep='_')
    df_selected_column = df_all_column[['review', 'voted_up', 'votes_up', 'author_playtime_at_review']]
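# How the cursor pagination above works, as a standalone sketch (same
# endpoint; each response's cursor is fed back in to get the next page):
#   page1 = get_reviews('287700', {'json': 1, 'cursor': '*', 'num_per_page': 100})
#   page2 = get_reviews('287700', {'json': 1, 'cursor': page1['cursor'], 'num_per_page': 100})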
import matplotlib.pyplot as plt # our main display package
import string # used for preprocessing
import re # used for preprocessing
import nltk # the Natural Language Toolkit, used for preprocessing
import numpy as np # used for managing NaNs
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords # used for preprocessing
from nltk.stem import WordNetLemmatizer # used for preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split # for splitting dataset into training and test
from sklearn.preprocessing import MinMaxScaler # for normalizing values
from sklearn import svm # Support Vector Machine model
from sklearn import metrics # scikit-learn metrics module, for accuracy calculation
from textblob import TextBlob # to determine subjectivity and polarity level
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
# remove urls, handles, and the hashtag from hashtags (taken from https://stackoverflow.com/questions/8376691/how-to-remove-hashtag-user-link-of-a-tweet-using-regular-expression)
def remove_urls(text):
    # raw string avoids invalid-escape warnings in the regex
    new_text = ' '.join(re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+://\S+)", " ", text).split())
    return new_text
# make all text lowercase
def text_lowercase(text):
    return text.lower()

# remove numbers
def remove_numbers(text):
    return re.sub(r'\d+', '', text)

# remove punctuation
def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

# tokenize
def tokenize(text):
    return word_tokenize(text)

# remove stopwords
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    return [i for i in text if i not in stop_words]

# lemmatize
lemmatizer = WordNetLemmatizer()
def lemmatize(text):
    return [lemmatizer.lemmatize(token) for token in text]

# full preprocessing pipeline, applied to every review before term weighting
def preprocessing(text):
    text = text_lowercase(text)
    text = remove_urls(text)
    text = remove_numbers(text)
    text = remove_punctuation(text)
    text = tokenize(text)
    text = remove_stopwords(text)
    text = lemmatize(text)
    return ' '.join(text)
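# Worked example of the pipeline above (the lemmatized output is
# approximate, since it depends on the installed WordNet data):
#   preprocessing("I LOVED this game! Played 100 hours: https://store.steampowered.com")
#   -> "loved game played hour"
# (URL and number removed, stopwords "i"/"this" dropped, "hours" lemmatized)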
@anvil.server.callable
def process_text():
    pp_text_train = []  # pre-processed text column
    for text_data in df_selected_column['review']:
        pp_text_data = preprocessing(text_data)
        pp_text_train.append(pp_text_data)
    df_selected_column['pp_text'] = pp_text_train  # add the pre-processed text column to the dataset
@anvil.server.callable
def polarity():
    # TextBlob polarity ranges from -1.0 (negative) to +1.0 (positive)
    text_polarity = []
    for text_data in df_selected_column['pp_text']:
        text_polarity.append(TextBlob(text_data).polarity)
    df_selected_column['polarity'] = text_polarity

@anvil.server.callable
def subjectivity():
    # TextBlob subjectivity ranges from 0.0 (objective) to 1.0 (subjective)
    text_subjectivity = []
    for text_data in df_selected_column['pp_text']:
        text_subjectivity.append(TextBlob(text_data).subjectivity)
    df_selected_column['subjectivity'] = text_subjectivity
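# Quick sanity check of the TextBlob scores (values come from TextBlob's
# pattern lexicon and may differ slightly between versions):
#   TextBlob("great game").polarity          # strongly positive, near +1
#   TextBlob("boring broken mess").polarity  # negative
#   TextBlob("great game").subjectivity      # high: "great" is an opinion word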
scaler = MinMaxScaler()

@anvil.server.callable
def normalize():
    # min-max scale each numeric feature into [0, 1]; the scaler is refitted
    # per column, so every column is scaled independently
    df_selected_column['author_playtime_at_review_normalized'] = scaler.fit_transform(df_selected_column[['author_playtime_at_review']])
    df_selected_column['review_votes_up_normalized'] = scaler.fit_transform(df_selected_column[['votes_up']])
    df_selected_column['review_length'] = df_selected_column['review'].apply(len)
    df_selected_column['review_length_normalized'] = scaler.fit_transform(df_selected_column[['review_length']])
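# MinMaxScaler applies x' = (x - min) / (max - min) per column. The same
# result by hand, for one column:
#   col = df_selected_column['votes_up']
#   manual = (col - col.min()) / (col.max() - col.min())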
import pickle

df_final = pd.DataFrame()  # feature matrix handed to the classifier
@anvil.server.callable
def term_weighing():
    # Weight the pre-processed reviews with the TF-IDF vectorizer that was
    # fitted during training, then join the TF-IDF features with the
    # polarity/subjectivity/normalized columns.
    global df_final
    with open('tf-idf.pkl', 'rb') as f:
        vectorizer = pickle.load(f)
    token = df_selected_column['pp_text'].to_dict()
    token = list(token.values())
    X = vectorizer.transform(token)
    feature_names = vectorizer.get_feature_names_out()  # get_feature_names() was removed in scikit-learn 1.2
    dense = X.todense()
    denselist = dense.tolist()
    dfvectorized = pd.DataFrame(denselist, columns=feature_names)
    df_final = pd.concat([dfvectorized,
                          df_selected_column.drop(['review', 'voted_up', 'votes_up',
                                                   'author_playtime_at_review', 'pp_text',
                                                   'review_length'], axis=1)],
                         axis=1, join='inner')
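# 'tf-idf.pkl' is loaded here but never created in this script. A sketch of
# how a matching vectorizer could have been fitted offline (the corpus
# 'train_reviews' is an assumption, not part of this app):
#   vectorizer = TfidfVectorizer()
#   vectorizer.fit(train_reviews)  # list of pre-processed review strings
#   with open('tf-idf.pkl', 'wb') as f:
#       pickle.dump(vectorizer, f)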
@anvil.server.callable
def classify():
    # load the trained classifier and label every fetched review
    with open('review_prediction_model.pkl', 'rb') as f:
        model = pickle.load(f)
    prediction = model.predict(df_final)
    df_selected_column['prediction'] = prediction
    return df_selected_column.to_dict('records')
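# 'review_prediction_model.pkl' is also produced offline. Given the so-far
# unused svm / train_test_split / metrics imports above, a plausible
# (hypothetical) training step, with voted_up as the label:
#   X_train, X_test, y_train, y_test = train_test_split(
#       df_final, df_selected_column['voted_up'], test_size=0.2)
#   model = svm.SVC()
#   model.fit(X_train, y_train)
#   print(metrics.accuracy_score(y_test, model.predict(X_test)))
#   with open('review_prediction_model.pkl', 'wb') as f:
#       pickle.dump(model, f)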
@anvil.server.callable
def clear_df():
    # empty both frames so the next run starts from a clean slate
    global df_selected_column, df_final
    df_selected_column = df_selected_column.iloc[0:0]
    df_final = df_final.iloc[0:0]
# Local test sequence (uncomment to run the pipeline without the Anvil client):
# get_n_reviews(287700, 100)
# process_text()
# polarity()
# subjectivity()
# normalize()
# term_weighing()
# classify()
# df_selected_column   # inspect the per-review frame
# df_final             # inspect the feature matrix
# clear_df()           # reset between runs
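# On the Anvil client, each @anvil.server.callable above is invoked with
# anvil.server.call. A hypothetical button handler on a client Form (the
# widget names appid_box and results_grid are assumptions for illustration):
#   def classify_button_click(self, **event_args):
#       anvil.server.call('get_n_reviews', self.appid_box.text, 100)
#       anvil.server.call('process_text')
#       anvil.server.call('polarity')
#       anvil.server.call('subjectivity')
#       anvil.server.call('normalize')
#       anvil.server.call('term_weighing')
#       rows = anvil.server.call('classify')
#       self.results_grid.items = rows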