# Per-domain, per-polarity review buckets.  NOTE(review): these named lists
# appear to stay empty — the loading loop below rebinds dataset[1] instead of
# appending into them.
negBooksReviews = []
negDvdReviews = []
negElectronicsReviews = []
negKitchenHousewaresReviews = []
posBooksReviews = []
posDvdReviews = []
posElectronicsReviews = []
posKitchenHousewaresReviews = []

# Each entry pairs a review-file path with the list that will hold its
# parsed [rating, review_text] records.
allDataset = [
    ["sorted_data_acl/books/negative.review", negBooksReviews],
    ["sorted_data_acl/books/positive.review", posBooksReviews],
    ["sorted_data_acl/dvd/negative.review", negDvdReviews],
    ["sorted_data_acl/dvd/positive.review", posDvdReviews],
    ["sorted_data_acl/electronics/negative.review", negElectronicsReviews],
    ["sorted_data_acl/electronics/positive.review", posElectronicsReviews],
    ["sorted_data_acl/kitchen_&_housewares/negative.review", negKitchenHousewaresReviews],
    ["sorted_data_acl/kitchen_&_housewares/positive.review", posKitchenHousewaresReviews],
]
def getRating(string):
    """Return the integer part of the <rating> value as a string.

    Takes everything before the closing tag, then everything after the
    opening tag, strips whitespace and drops any decimal part
    (e.g. "4.0" -> "4").
    """
    before_close = string.split('</rating>')[0]
    value = before_close.split('<rating>', 1)[1].strip()
    return value.split('.', 1)[0]
def getReviewText(string):
    """Extract the <review_text> body, lower-cased with normalized whitespace.

    Fix: the old version deleted newlines outright (gluing words from
    adjacent lines together) and only collapsed one level of doubled
    spaces.  Splitting on arbitrary whitespace and re-joining with single
    spaces handles both correctly.
    """
    text = string.split('</review_text>')[0].split('<review_text>')[1]
    return ' '.join(text.split()).lower()
def getData(fileName):
    """Parse a pseudo-XML review file into a list of [rating, review_text].

    Fix: the file handle was opened and never closed (resource leak);
    a `with` block now closes it deterministically.
    """
    finalData = []
    with open(fileName, "r") as raw_data:
        txt_data = raw_data.read()
    # Records are delimited by the closing </review> tag; the final split
    # fragment is trailing whitespace, filtered out by the length check.
    for review in txt_data.split('</review>'):
        if len(review) > 1:
            finalData.append([getRating(review), getReviewText(review)])
    return finalData
# Load every review file from disk; each dataset[1] becomes a list of
# [rating, review_text] pairs.  NOTE(review): this rebinds dataset[1], so
# the named neg*/pos* lists defined above remain empty.
for dataset in allDataset :
    dataset[1] = getData(dataset[0])
from sklearn.feature_extraction.text import TfidfVectorizer
# Exploratory step: vectorize the first dataset (books/negative) with TF-IDF
# and inspect the vocabulary and document-term matrix shape.
corpus = []
for review in allDataset[0][1]:
    corpus.append(review[1])  # review is [rating, review_text]
vectorizer = TfidfVectorizer()
tokens = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names_out())
print(tokens.shape)
import nltk
# Exploratory step: tokenize, POS-tag and NE-chunk the first review of the
# first dataset.  The download() calls fetch the required NLTK models
# (network access needed on first run; cached afterwards).
print(allDataset[0][1][0][1])
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
tokens = nltk.word_tokenize(allDataset[0][1][0][1])
print(tokens)
tagged = nltk.pos_tag(tokens)
print(tagged)
entities = nltk.chunk.ne_chunk(tagged)
print(entities)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from collections import Counter
    
def getTrainingDataset(datasets):
    """Build the training split from the first 80% of each dataset.

    Args:
        datasets: list of [file_path, [[rating, review_text], ...]] entries.

    Returns:
        [reviews, ratings, domains, polarities] — four parallel lists.
        Ratings "1"/"2" are labelled "negative", everything else "positive".
    """
    trainingReviews = []
    trainingRatings = []
    trainingPolarities = []
    trainingDomains = []
    for dataset in datasets:
        # Second path component is the domain, e.g. "books" — the old
        # trailing .split('/')[0] was a no-op and has been removed.
        domain = dataset[0].split('/')[1]
        reviews = dataset[1]
        for i in range(round(len(reviews) * 0.8)):
            rating, text = reviews[i]
            trainingReviews.append(text)
            trainingRatings.append(rating)
            trainingDomains.append(domain)
            trainingPolarities.append("negative" if rating in ("1", "2") else "positive")
    return [trainingReviews, trainingRatings, trainingDomains, trainingPolarities]
def getTestDataset(datasets):
    """Build the test split from the last 20% of each dataset.

    Args:
        datasets: list of [file_path, [[rating, review_text], ...]] entries.

    Returns:
        [reviews, ratings, domains, polarities] — four parallel lists,
        mirroring getTrainingDataset but covering indices from
        round(len * 0.8) to the end.
    """
    testReviews = []
    testRatings = []
    testPolarities = []
    testDomains = []
    for dataset in datasets:
        # Second path component is the domain, e.g. "books" — the old
        # trailing .split('/')[0] was a no-op and has been removed.
        domain = dataset[0].split('/')[1]
        reviews = dataset[1]
        for i in range(round(len(reviews) * 0.8), len(reviews)):
            rating, text = reviews[i]
            testReviews.append(text)
            testRatings.append(rating)
            testDomains.append(domain)
            testPolarities.append("negative" if rating in ("1", "2") else "positive")
    return [testReviews, testRatings, testDomains, testPolarities]
def predict(trainingDataset, testDataset):
    """Train a TF-IDF + Perceptron pipeline and print its test accuracy.

    Both arguments are [texts, labels] pairs: the pipeline is fitted on the
    training pair and scored on the test pair via displayScore.
    """
    pipeline = Pipeline([
        ('vectorizer', TfidfVectorizer(stop_words="english")),
        ('classifier', Perceptron()),
    ])
    pipeline.fit(trainingDataset[0], trainingDataset[1])
    displayScore(testDataset[1], pipeline.predict(testDataset[0]))
def displayScore(testRatings, testPrediction):
    """Print the accuracy of *testPrediction* against *testRatings*.

    Precision/recall/F-score/confusion-matrix reporting was disabled in the
    original; only accuracy is shown, followed by a blank separator line.
    """
    acc = accuracy_score(testRatings, testPrediction)
    print(f"Accuracy: {acc}")
    print()
def predictAllPossibilties(trainingDataset):
    """Evaluate a training split against every test-set combination.

    Runs three test configurations: the full corpus, each dataset alone,
    and each negative/positive pair of the same domain.  Fix: the
    ratings/domains/polarities evaluation block was copy-pasted three
    times; it now lives in one local helper.

    Args:
        trainingDataset: [reviews, ratings, domains, polarities] as
            produced by getTrainingDataset.
    """
    def _evaluate(testDataset):
        # One prediction per label type.  Domain/polarity runs are skipped
        # when the training labels contain a single class (a classifier
        # trained on one class has nothing to discriminate).
        print("Ratings : ")
        predict([trainingDataset[0], trainingDataset[1]], [testDataset[0], testDataset[1]])
        if (len(Counter(trainingDataset[2]).keys()) > 1):
            print("Domains : ")
            predict([trainingDataset[0], trainingDataset[2]], [testDataset[0], testDataset[2]])
        if (len(Counter(trainingDataset[3]).keys()) > 1):
            print("Polarities : ")
            predict([trainingDataset[0], trainingDataset[3]], [testDataset[0], testDataset[3]])

    print("Test with allDataset")
    _evaluate(getTestDataset(allDataset))
    for dataset in allDataset:
        print("Test with ", dataset[0])
        _evaluate(getTestDataset([dataset]))
    # Datasets come in negative/positive pairs per domain, hence step 2.
    for i in range(0, len(allDataset) - 1, 2):
        print("Test with ", allDataset[i][0], " and ", allDataset[i + 1][0])
        _evaluate(getTestDataset([allDataset[i], allDataset[i + 1]]))
# Driver: run the full experiment grid.  Training configurations mirror the
# test configurations inside predictAllPossibilties: the whole corpus, each
# dataset alone, then each negative/positive pair of the same domain.
print("Train with allDataset")
print("---------------------------------------------------")
trainingDataset = getTrainingDataset(allDataset)
predictAllPossibilties(trainingDataset)

for singleDataset in allDataset:
    print("---------------------------------------------------")
    print("Train with ", singleDataset[0])
    print("---------------------------------------------------")
    trainingDataset = getTrainingDataset([singleDataset])
    predictAllPossibilties(trainingDataset)

# Datasets come in negative/positive pairs per domain, hence step 2.
for pairStart in range(0, len(allDataset) - 1, 2):
    print("---------------------------------------------------")
    print("Train with ", allDataset[pairStart][0], " and ", allDataset[pairStart + 1][0])
    print("---------------------------------------------------")
    trainingDataset = getTrainingDataset([allDataset[pairStart], allDataset[pairStart + 1]])
    predictAllPossibilties(trainingDataset)