-- Load every column of the exported CSV by querying the file path directly.
-- NOTE(review): using a quoted file path as a table is engine-specific
-- (DuckDB supports it) -- confirm the target engine.
-- `select *` is kept because the file's column list is not visible here.
-- The statement is terminated so it cannot run into the query that follows.
select *
from '/datasets/built-file/built_int_export.csv';
-- Label each review by its score: above 5.9 is 'positive', below 5.0 is
-- 'negative', and everything else is 'neutral'. A NULL score fails both
-- comparisons and therefore also falls through to 'neutral'.
select
    df_1.reviewid
    ,df_1.content
    ,case
        when df_1.score > 5.9 then 'positive'
        when df_1.score < 5.0 then 'negative'
        else 'neutral'
    end as category
from df_1;
import nltk
import random
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Download the NLTK resources first: the stop-word lists and the 'punkt'
# tokenizer models.
nltk.download('stopwords')
nltk.download('punkt')
# English stop words, held in a set
stop_words = set(stopwords.words("english"))
# Train Naive Bayes classifier
# Every entry of `document` is unpacked as a (tokens, label) pair; the tokens
# are passed through document_features (defined elsewhere) and the result is
# paired with the label to form one training example.
train_set = [(document_features(tokens), label) for tokens, label in document]
classifier = nltk.NaiveBayesClassifier.train(train_set)
# Tokenize test set
# Drop rows with missing content from the DataFrame itself. Calling
# dropna(inplace=True) on df_test['content'] only affects the extracted
# Series, so the rows stayed in df_test and astype(str) below turned the
# missing values into the literal text 'nan'.
df_test.dropna(subset=['content'], inplace=True)
df_test['content'] = df_test['content'].astype(str)
def tokenize(column):
    """Split a text into word tokens, keeping only purely alphabetic ones.

    Args:
        column: The text to tokenize (a single string, despite the name).

    Returns:
        A list of the tokens produced by ``nltk.word_tokenize`` for which
        ``str.isalpha()`` is true, in their original order and casing.
    """
    tokens = nltk.word_tokenize(column)
    # isalpha() drops punctuation, numbers and mixed tokens such as "n't".
    return [w for w in tokens if w.isalpha()]
# Tokenize each review's text. Applying tokenize to the 'content' column
# directly avoids building a row object per record just to read one field,
# and always yields one list of tokens per row.
df_test['tokenized'] = df_test['content'].apply(tokenize)
# Preview the first few tokenized rows (displayed when run as a notebook cell).
df_test[['tokenized']].head()
# Format test set for running through classifier
# Shuffle the rows (frac=1 returns every row, in random order).
df_test = df_test.sample(frac=1)
# Pair each row's token list with its category label.
df_test['tuples_column'] = list(zip(df_test['tokenized'], df_test['category']))
document2 = df_test['tuples_column'].values.tolist()
# Lower-case the tokens of the TEST documents. These two lines previously
# iterated over `document` (the training data), so the accuracy printed below
# was measured on the training set and the test rows were never used.
test_temp = [[token.lower() for token in tokens] for tokens, _ in document2]
test_temp2 = [label for _, label in document2]
document2 = tuple(zip(test_temp, test_temp2))
# Build (features, label) pairs the same way the training set is built.
test_set = [(document_features(d), c) for (d, c) in document2]
print(nltk.classify.accuracy(classifier, test_set))
classifier.show_most_informative_features(5)