# One accumulator list per (polarity, domain) corpus; they start empty and
# are swapped out for parsed reviews by the getData() loop further down.
negBooksReviews = []
negDvdReviews = []
negElectronicsReviews = []
negKitchenHousewaresReviews = []
posBooksReviews = []
posDvdReviews = []
posElectronicsReviews = []
posKitchenHousewaresReviews = []

# [reviewFilePath, reviewContainer] pairs: negative/positive files for each
# of the four product domains, in (books, dvd, electronics, kitchen) order.
allDataset = [
    ["sorted_data_acl/books/negative.review", negBooksReviews],
    ["sorted_data_acl/books/positive.review", posBooksReviews],
    ["sorted_data_acl/dvd/negative.review", negDvdReviews],
    ["sorted_data_acl/dvd/positive.review", posDvdReviews],
    ["sorted_data_acl/electronics/negative.review", negElectronicsReviews],
    ["sorted_data_acl/electronics/positive.review", posElectronicsReviews],
    ["sorted_data_acl/kitchen_&_housewares/negative.review", negKitchenHousewaresReviews],
    ["sorted_data_acl/kitchen_&_housewares/positive.review", posKitchenHousewaresReviews],
]
def getRating(string):
    """Return the integer part of the <rating> value as a string (e.g. '4')."""
    before_close = string.split('</rating>')[0]
    raw_value = before_close.split('<rating>')[1].strip()
    # Ratings are stored as floats like '4.0'; keep only the whole part.
    return raw_value.split('.')[0]
def getReviewText(string):
    """Extract the <review_text> body, strip newlines, and lowercase it."""
    body = string.split('</review_text>')[0].split('<review_text>')[1]
    # NOTE(review): replace(' ', ' ') is a no-op as written — possibly the
    # intent was to collapse doubled spaces; confirm before changing it.
    cleaned = body.strip().replace(' ', ' ').replace('\n', '')
    return cleaned.lower()
def getData(fileName):
    """Parse one *.review file into a list of [rating, reviewText] pairs.

    fileName: path to a pseudo-XML review file whose records are terminated
    by '</review>'.  Returns a list of [rating, reviewText] string pairs.
    """
    finalData = []
    # 'with' guarantees the handle is closed; the original opened the file
    # and never closed it (resource leak).
    with open(fileName, "r") as raw_data:
        txt_data = raw_data.read()
    # The last fragment after the final '</review>' is just trailing
    # whitespace; the length check filters it out, as in the original.
    for review in txt_data.split('</review>'):
        if len(review) > 1:
            rating = getRating(review)
            reviewText = getReviewText(review)
            finalData.append([rating, reviewText])
    return finalData
# Replace each placeholder container with the parsed reviews for its file.
for slot, (path, _unused) in enumerate(allDataset):
    allDataset[slot][1] = getData(path)
from sklearn.feature_extraction.text import TfidfVectorizer

# Demo: TF-IDF featurization of the negative book reviews (allDataset[0]).
corpus = [rating_and_text[1] for rating_and_text in allDataset[0][1]]
vectorizer = TfidfVectorizer()
tokens = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names_out())
print(tokens.shape)
import nltk

# Demo: tokenize, POS-tag, and NE-chunk the first negative book review.
print(allDataset[0][1][0][1])

# Resources needed for tokenisation, tagging and named-entity chunking.
for resource in ('punkt', 'averaged_perceptron_tagger', 'maxent_ne_chunker', 'words'):
    nltk.download(resource)

tokens = nltk.word_tokenize(allDataset[0][1][0][1])
print(tokens)
tagged = nltk.pos_tag(tokens)
print(tagged)
entities = nltk.chunk.ne_chunk(tagged)
print(entities)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from collections import Counter
def getTrainingDataset(datasets):
    """Build the training split: the first 80% (rounded) of each dataset.

    datasets: list of [filePath, reviews] pairs, where reviews is a list of
    [rating, reviewText] string pairs.
    Returns [reviews, ratings, domains, polarities] as parallel lists.
    """
    trainingReviews = []
    trainingRatings = []
    trainingPolarities = []
    trainingDomains = []
    for dataset in datasets:
        path, reviews = dataset[0], dataset[1]
        # Domain is the directory under the corpus root, e.g.
        # 'sorted_data_acl/books/negative.review' -> 'books'.  (The original
        # re-split the result on '/' again, which was a no-op; it also
        # recomputed this loop-invariant value for every review.)
        domain = path.split('/')[1]
        for rating, text in reviews[:round(len(reviews) * 0.8)]:
            trainingReviews.append(text)
            trainingRatings.append(rating)
            trainingDomains.append(domain)
            # Ratings 1-2 count as negative; everything else as positive.
            trainingPolarities.append("negative" if rating in ("1", "2") else "positive")
    return [trainingReviews, trainingRatings, trainingDomains, trainingPolarities]
def getTestDataset(datasets):
    """Build the test split: the last ~20% of each dataset's reviews.

    datasets: list of [filePath, reviews] pairs, where reviews is a list of
    [rating, reviewText] string pairs.
    Returns [reviews, ratings, domains, polarities] as parallel lists.
    """
    testReviews = []
    testRatings = []
    testPolarities = []
    testDomains = []
    for entry in datasets:
        reviews = entry[1]
        # Everything from the 80% mark (rounded) onwards is test data.
        cutoff = round(len(reviews) * 0.8)
        for item in reviews[cutoff:]:
            testReviews.append(item[1])
            testRatings.append(item[0])
            testDomains.append(entry[0].split('/')[1].split('/')[0])
            if item[0] == "1" or item[0] == "2":
                testPolarities.append("negative")
            else:
                testPolarities.append("positive")
    return [testReviews, testRatings, testDomains, testPolarities]
def predict(trainingDataset, testDataset):
    """Train a TF-IDF + Perceptron pipeline and print its test accuracy.

    trainingDataset / testDataset: [texts, labels] pairs of parallel lists.
    """
    model = Pipeline([
        ('vectorizer', TfidfVectorizer(stop_words="english")),
        ('classifier', Perceptron()),
    ])
    model.fit(trainingDataset[0], trainingDataset[1])
    predictions = model.predict(testDataset[0])
    displayScore(testDataset[1], predictions)
def displayScore(testRatings, testPrediction):
    """Print the accuracy of *testPrediction* against *testRatings*.

    Precision, recall, F-score and the confusion matrix were disabled in the
    original; only accuracy is reported, followed by a blank line.
    """
    print(f"Accuracy: {accuracy_score(testRatings, testPrediction)}")
    print()
def _scoreAllLabelKinds(trainingDataset, testDataset):
    """Run predict() for ratings, then domains and polarities when the
    training data has more than one class for that label kind.

    Both arguments are [reviews, ratings, domains, polarities] parallel-list
    bundles as produced by getTrainingDataset()/getTestDataset().
    """
    print("Ratings : ")
    predict([trainingDataset[0], trainingDataset[1]], [testDataset[0], testDataset[1]])
    # A single-class label set cannot be learned, so skip it.
    if (len(Counter(trainingDataset[2]).keys()) > 1):
        print("Domains : ")
        predict([trainingDataset[0], trainingDataset[2]], [testDataset[0], testDataset[2]])
    if (len(Counter(trainingDataset[3]).keys()) > 1):
        print("Polarities : ")
        predict([trainingDataset[0], trainingDataset[3]], [testDataset[0], testDataset[3]])

def predictAllPossibilties(trainingDataset):
    """Evaluate *trainingDataset* against every test combination:

    1. the full test split over all datasets,
    2. each dataset's test split individually,
    3. each (negative, positive) same-domain file pair.

    The original repeated the ratings/domains/polarities prediction block
    verbatim three times; it is factored into _scoreAllLabelKinds().
    """
    print("Test with allDataset")
    _scoreAllLabelKinds(trainingDataset, getTestDataset(allDataset))
    for dataset in allDataset:
        print("Test with ", dataset[0])
        _scoreAllLabelKinds(trainingDataset, getTestDataset([dataset]))
    # Consecutive entries in allDataset are the negative/positive files of
    # the same domain, so step through them two at a time.
    for i in range(0, len(allDataset) - 1, 2):
        print("Test with ", allDataset[i][0], " and ", allDataset[i + 1][0])
        _scoreAllLabelKinds(trainingDataset, getTestDataset([allDataset[i], allDataset[i + 1]]))
# Experiment driver: train on (1) everything, (2) each file alone, and
# (3) each same-domain negative/positive file pair, evaluating every test
# combination each time via predictAllPossibilties().
print("Train with allDataset")
print("---------------------------------------------------")
trainingDataset = getTrainingDataset(allDataset)
predictAllPossibilties(trainingDataset)

# One training file at a time.
for singleDataset in allDataset:
    print("---------------------------------------------------")
    print("Train with ", singleDataset[0])
    print("---------------------------------------------------")
    trainingDataset = getTrainingDataset([singleDataset])
    predictAllPossibilties(trainingDataset)

# Consecutive entries are the negative/positive files of one domain.
for pairStart in range(0, len(allDataset) - 1, 2):
    pair = allDataset[pairStart:pairStart + 2]
    print("---------------------------------------------------")
    print("Train with ", pair[0][0], " and ", pair[1][0])
    print("---------------------------------------------------")
    trainingDataset = getTrainingDataset(pair)
    predictAllPossibilties(trainingDataset)