ML - NLP

import pandas as pd from sklearn.model_selection import train_test_split import seaborn as sns import numpy as np from sklearn.dummy import DummyClassifier from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix from sklearn.pipeline import Pipeline from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.linear_model import Perceptron from sklearn.feature_extraction.text import CountVectorizer from sklearn.feature_extraction.text import HashingVectorizer from sklearn.neighbors import KNeighborsClassifier from sklearn.naive_bayes import MultinomialNB from sklearn.tree import DecisionTreeClassifier from sklearn.svm import SVC

# PARSE DATA
books_reviews = []
dvd_reviews = []
electronics_reviews = []
kitchen_reviews = []

# Destination list for every product domain that is kept; a review whose
# domain is not listed here is discarded.
_REVIEWS_BY_DOMAIN = {
    'books': books_reviews,
    'dvd': dvd_reviews,
    'electronics': electronics_reviews,
    'kitchen & housewares': kitchen_reviews,
}
_EMPTY_REVIEW = {'rating': '', 'text': '', 'domain': ''}

# The context manager closes the file even when reading fails.
with open("data/reviews.txt", "r") as f:
    data = f.read().splitlines()

review = dict(_EMPTY_REVIEW)
# Pair each line with its successor: a tag line is followed by its value.
# Pairing also avoids indexing past the end when a tag is the last line.
for tag, value in zip(data, data[1:]):
    if tag == '<rating>':
        review['rating'] = value
    elif tag == '<review_text>':
        review['text'] = value
    elif tag == '<product_type>':
        review['domain'] = value
    elif tag == '<unique_id>' and review != _EMPTY_REVIEW:
        # A <unique_id> line seen while fields are filled closes the review
        # collected so far.
        if review['domain'] in _REVIEWS_BY_DOMAIN:
            _REVIEWS_BY_DOMAIN[review['domain']].append(review)
        review = dict(_EMPTY_REVIEW)

# Reviews are only stored when a later <unique_id> line is reached, so the
# fields collected after the last one would otherwise be lost.
if review != _EMPTY_REVIEW and review['domain'] in _REVIEWS_BY_DOMAIN:
    _REVIEWS_BY_DOMAIN[review['domain']].append(review)

import pandas as pd


def transform_to_dataframe(reviews):
    """Build a DataFrame from a list of parsed review dicts.

    Args:
        reviews: list of dicts with the keys ``rating``, ``text`` and
            ``domain``. May be empty.

    Returns:
        A DataFrame with the columns ``rating``, ``text``, ``domain`` and
        ``polarity``. ``polarity`` is ``"positive"`` for the ratings
        ``"4.0"`` and ``"5.0"`` and ``"negative"`` for every other rating.
    """
    # The domain is read per row instead of from reviews[0], so an empty
    # list yields an empty frame rather than an IndexError.
    return pd.DataFrame(data={
        'rating': [r["rating"] for r in reviews],
        'text': [r["text"] for r in reviews],
        'domain': [r["domain"] for r in reviews],
        'polarity': [
            "positive" if r["rating"] in ("4.0", "5.0") else "negative"
            for r in reviews
        ],
    })


books_reviews = transform_to_dataframe(books_reviews)
dvd_reviews = transform_to_dataframe(dvd_reviews)
electronics_reviews = transform_to_dataframe(electronics_reviews)
kitchen_reviews = transform_to_dataframe(kitchen_reviews)

# Stack the four domain frames into one frame with a fresh 0..n-1 index.
mixed = pd.concat(
    [books_reviews, dvd_reviews, electronics_reviews, kitchen_reviews],
    ignore_index=True,
)
# Fix the category order so the rating axis is drawn as 1, 2, 4, 5.
mixed['rating'] = pd.Categorical(
    mixed['rating'],
    categories=['1.0', '2.0', '4.0', '5.0'],
)
# Rating counts per domain, bars placed side by side.
ax = sns.displot(data=mixed, x="rating", hue='domain', multiple='dodge')

# Shuffle every domain frame with the same fixed seed and renumber its rows.
books_reviews, dvd_reviews, electronics_reviews, kitchen_reviews = (
    frame.sample(frac=1, random_state=42).reset_index(drop=True)
    for frame in (books_reviews, dvd_reviews, electronics_reviews, kitchen_reviews)
)
dvd_reviews.head()

from sklearn.model_selection import train_test_split

# 80/20 train/test split per domain, each with the same fixed seed.
(
    (books_train, books_test),
    (dvd_train, dvd_test),
    (electronics_train, electronics_test),
    (kitchen_train, kitchen_test),
) = (
    train_test_split(frame, test_size=0.2, random_state=42)
    for frame in (books_reviews, dvd_reviews, electronics_reviews, kitchen_reviews)
)

# Combine the per-domain training splits, shuffle them, renumber the rows.
mixed_train = (
    pd.concat([books_train, dvd_train, electronics_train, kitchen_train])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
mixed_train.head()

# Combine the per-domain test splits, shuffle them, renumber the rows.
mixed_test = (
    pd.concat([books_test, dvd_test, electronics_test, kitchen_test])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
mixed_test.head()

import seaborn as sns


def distribution_table(dataset):
    """Draw a histogram of the ``rating`` column of ``dataset``.

    Args:
        dataset: data passed to ``sns.histplot``; must provide ``rating``.

    Returns:
        None. The plot is the only result.
    """
    sns.histplot(dataset, x="rating")


distribution_table(mixed_train)
distribution_table(mixed_test)

import numpy as np

# Count how often each rating value occurs in the mixed test set.
unique, amount = np.unique(mixed_test["rating"].to_numpy(), return_counts=True)
counts = dict(zip(unique, amount))
print(counts)

# Share of '5.0' ratings among all test reviews. The total comes from the
# count array itself, so the builtin ``sum`` is no longer rebound to an int.
tp = counts['5.0'] / amount.sum()
print(tp)

# Hand-computed scores derived from the share ``tp``.
false_positive_share = 1 - tp  # FP is (1 - TP) in this case
false_negative_share = 0
precision = tp / (tp + false_positive_share)
recall = tp / (tp + false_negative_share)
f_score = (2 * precision * recall) / (precision + recall)
print(f"precision: {precision}, recall: {recall}, f_score: {f_score}")

from sklearn.dummy import DummyClassifier

# Stratified dummy baseline for rating prediction on the books domain.
X = books_train['text']
y = books_train['rating']
dummy_clf = DummyClassifier(strategy="stratified").fit(X, y)
y_pred = dummy_clf.predict(books_test['text'])

from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix


def print_scores(test_Y, test_Y_pred):
    """Print the confusion matrix, accuracy and macro-averaged scores.

    Args:
        test_Y: true labels.
        test_Y_pred: predicted labels, aligned with ``test_Y``.

    Returns:
        None. Everything is written to standard output.
    """
    print(f"Confusion matrix:\n {confusion_matrix(test_Y, test_Y_pred)}")
    print(f"Accuracy: {accuracy_score(test_Y, test_Y_pred)}")
    macro_metrics = (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F-score", f1_score),
    )
    for label, metric in macro_metrics:
        print(f"{label}: {metric(test_Y, test_Y_pred, average='macro')}")


print_scores(books_test['rating'], y_pred)

# Inputs (review text) and targets (polarity) for the mixed-domain task.
train_X, train_Y = mixed_train['text'], mixed_train['polarity']
test_X, test_Y = mixed_test['text'], mixed_test['polarity']

from sklearn.pipeline import Pipeline


def run_classifier_pipe(vec, clf, train_X, train_Y, test_X):
    """Fit a vectorizer + classifier pipeline and predict on ``test_X``.

    Args:
        vec: vectorizer used as the first pipeline step.
        clf: classifier used as the final pipeline step.
        train_X: training inputs.
        train_Y: training targets.
        test_X: inputs to predict on after fitting.

    Returns:
        The predictions of the fitted pipeline for ``test_X``.
    """
    steps = [('vectorizer', vec), ('classifier', clf)]
    fitted = Pipeline(steps).fit(train_X, train_Y)
    return fitted.predict(test_X)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron

# Perceptron on TF-IDF features with English stop words removed.
vec = TfidfVectorizer(stop_words="english")
clf = Perceptron()
test_Y_pred_Tfidf = run_classifier_pipe(
    vec=vec, clf=clf, train_X=train_X, train_Y=train_Y, test_X=test_X
)
print_scores(test_Y=test_Y, test_Y_pred=test_Y_pred_Tfidf)

from sklearn.feature_extraction.text import CountVectorizer

# Perceptron on plain word counts.
vec = CountVectorizer()
clf = Perceptron()
test_Y_pred_counts = run_classifier_pipe(
    vec=vec, clf=clf, train_X=train_X, train_Y=train_Y, test_X=test_X
)
print_scores(test_Y=test_Y, test_Y_pred=test_Y_pred_counts)

# Word 4-grams only.
vec = CountVectorizer(ngram_range=(4, 4))
clf = Perceptron()
test_Y_pred_counts_ng44 = run_classifier_pipe(
    vec=vec, clf=clf, train_X=train_X, train_Y=train_Y, test_X=test_X
)
print_scores(test_Y=test_Y, test_Y_pred=test_Y_pred_counts_ng44)

# Word 2- to 4-grams, case preserved.
vec = CountVectorizer(analyzer='word', ngram_range=(2, 4), lowercase=False)
clf = Perceptron()
test_Y_pred_counts_ng24 = run_classifier_pipe(
    vec=vec, clf=clf, train_X=train_X, train_Y=train_Y, test_X=test_X
)
print_scores(test_Y=test_Y, test_Y_pred=test_Y_pred_counts_ng24)

# Character 2- to 10-grams over the raw text.
vec = CountVectorizer(analyzer='char', ngram_range=(2, 10))
clf = Perceptron()
test_Y_pred_counts_char = run_classifier_pipe(
    vec=vec, clf=clf, train_X=train_X, train_Y=train_Y, test_X=test_X
)
print_scores(test_Y=test_Y, test_Y_pred=test_Y_pred_counts_char)

# Character 2- to 10-grams with the 'char_wb' analyzer.
vec = CountVectorizer(analyzer='char_wb', ngram_range=(2, 10))
clf = Perceptron()
test_Y_pred_counts_wbchar = run_classifier_pipe(
    vec=vec, clf=clf, train_X=train_X, train_Y=train_Y, test_X=test_X
)
print_scores(test_Y=test_Y, test_Y_pred=test_Y_pred_counts_wbchar)

from sklearn.feature_extraction.text import HashingVectorizer

# Perceptron on hashed features with the vectorizer's default settings.
vec = HashingVectorizer()
clf = Perceptron()
test_Y_pred_hashing = run_classifier_pipe(
    vec=vec, clf=clf, train_X=train_X, train_Y=train_Y, test_X=test_X
)
print_scores(test_Y=test_Y, test_Y_pred=test_Y_pred_hashing)

# Only 2**4 hash buckets.
vec = HashingVectorizer(n_features=2**4)
clf = Perceptron()
test_Y_pred_hashing_2p4 = run_classifier_pipe(
    vec=vec, clf=clf, train_X=train_X, train_Y=train_Y, test_X=test_X
)
print_scores(test_Y=test_Y, test_Y_pred=test_Y_pred_hashing_2p4)

# 2**31 - 1 hash buckets.
vec = HashingVectorizer(n_features=(2**31 - 1))
clf = Perceptron()
test_Y_pred_hashing_2p31 = run_classifier_pipe(
    vec=vec, clf=clf, train_X=train_X, train_Y=train_Y, test_X=test_X
)
print_scores(test_Y=test_Y, test_Y_pred=test_Y_pred_hashing_2p31)

# Predictions of every vectorizer experiment, keyed by the column label
# used in the overview table (insertion order fixes the column order).
_predictions_by_setup = {
    'TF-IDF': test_Y_pred_Tfidf,
    'Counts': test_Y_pred_counts,
    'Counts ngrams=4,4': test_Y_pred_counts_ng44,
    'Counts ngrams=2,4': test_Y_pred_counts_ng24,
    'Counts char': test_Y_pred_counts_char,
    'Counts wb_char': test_Y_pred_counts_wbchar,
    'Hashing': test_Y_pred_hashing,
    'Hashing 2^4': test_Y_pred_hashing_2p4,
    'Hashing 2^31-1': test_Y_pred_hashing_2p31,
}

# One column per experiment; rows are the four scores rounded to 4 decimals.
overviewtable = pd.DataFrame(
    data={
        setup: [
            round(accuracy_score(test_Y, predicted), 4),
            round(precision_score(test_Y, predicted, average='macro'), 4),
            round(recall_score(test_Y, predicted, average='macro'), 4),
            round(f1_score(test_Y, predicted, average='macro'), 4),
        ]
        for setup, predicted in _predictions_by_setup.items()
    },
    index=['Accuracy', 'Precision', 'Recall', 'F-score'],
)
overviewtable.transpose().head(10)

# Text inputs and polarity targets for each domain's train and test split.
train_books_X, train_books_Y = books_train['text'], books_train['polarity']
test_books_X, test_books_Y = books_test['text'], books_test['polarity']

train_dvd_X, train_dvd_Y = dvd_train['text'], dvd_train['polarity']
test_dvd_X, test_dvd_Y = dvd_test['text'], dvd_test['polarity']

train_electronics_X, train_electronics_Y = electronics_train['text'], electronics_train['polarity']
test_electronics_X, test_electronics_Y = electronics_test['text'], electronics_test['polarity']

train_kitchen_X, train_kitchen_Y = kitchen_train['text'], kitchen_train['polarity']
test_kitchen_X, test_kitchen_Y = kitchen_test['text'], kitchen_test['polarity']

from sklearn.linear_model import Perceptron
from sklearn.feature_extraction.text import CountVectorizer

# One independent default Perceptron per domain, plus a word-count vectorizer.
books_clf, dvd_clf, electronics_clf, kitchen_clf = (Perceptron() for _ in range(4))
vec = CountVectorizer()

# Train and evaluate one polarity classifier per domain, each on its own data.
books_Y_pred = run_classifier_pipe(
    vec=vec, clf=books_clf,
    train_X=train_books_X, train_Y=train_books_Y, test_X=test_books_X,
)
print_scores(test_Y=test_books_Y, test_Y_pred=books_Y_pred)

dvd_Y_pred = run_classifier_pipe(
    vec=vec, clf=dvd_clf,
    train_X=train_dvd_X, train_Y=train_dvd_Y, test_X=test_dvd_X,
)
print_scores(test_Y=test_dvd_Y, test_Y_pred=dvd_Y_pred)

electronics_Y_pred = run_classifier_pipe(
    vec=vec, clf=electronics_clf,
    train_X=train_electronics_X, train_Y=train_electronics_Y, test_X=test_electronics_X,
)
print_scores(test_Y=test_electronics_Y, test_Y_pred=electronics_Y_pred)

kitchen_Y_pred = run_classifier_pipe(
    vec=vec, clf=kitchen_clf,
    train_X=train_kitchen_X, train_Y=train_kitchen_Y, test_X=test_kitchen_X,
)
print_scores(test_Y=test_kitchen_Y, test_Y_pred=kitchen_Y_pred)

# Books domain again, with a larger iteration cap ...
books_clf_iter_max_changes = Perceptron(max_iter=2000)
books_Y2_pred = run_classifier_pipe(
    vec=vec, clf=books_clf_iter_max_changes,
    train_X=train_books_X, train_Y=train_books_Y, test_X=test_books_X,
)
print_scores(test_Y=test_books_Y, test_Y_pred=books_Y2_pred)

# ... and with a very small one.
books_clf_iter_min_changes = Perceptron(max_iter=5)
books_Y3_pred = run_classifier_pipe(
    vec=vec, clf=books_clf_iter_min_changes,
    train_X=train_books_X, train_Y=train_books_Y, test_X=test_books_X,
)
print_scores(test_Y=test_books_Y, test_Y_pred=books_Y3_pred)

from sklearn.neighbors import KNeighborsClassifier

# Domain classification: predict the product domain from the review text.
domain_train_X = mixed_train['text']
domain_train_Y = mixed_train['domain']
domain_test_X = mixed_test['text']
domain_test_Y = mixed_test['domain']

# Character n-grams (2-5) with the 'char_wb' analyzer; n-grams that occur
# in more than 95% of the documents are dropped.
# ``stop_words`` is not passed: scikit-learn applies it only when
# analyzer == 'word' and otherwise just warns that it is unused.
vec = CountVectorizer(analyzer='char_wb', ngram_range=(2, 5), max_df=0.95)

# k-nearest-neighbours domain classifiers with different neighbourhood sizes.

# Library default for n_neighbors.
clf = KNeighborsClassifier()
domain_Y_pred = run_classifier_pipe(
    vec=vec, clf=clf,
    train_X=domain_train_X, train_Y=domain_train_Y, test_X=domain_test_X,
)
print_scores(test_Y=domain_test_Y, test_Y_pred=domain_Y_pred)

clf_2 = KNeighborsClassifier(n_neighbors=2)
domain_Y2_pred = run_classifier_pipe(
    vec=vec, clf=clf_2,
    train_X=domain_train_X, train_Y=domain_train_Y, test_X=domain_test_X,
)
print_scores(test_Y=domain_test_Y, test_Y_pred=domain_Y2_pred)

clf_7 = KNeighborsClassifier(n_neighbors=7)
domain_Y7_pred = run_classifier_pipe(
    vec=vec, clf=clf_7,
    train_X=domain_train_X, train_Y=domain_train_Y, test_X=domain_test_X,
)
print_scores(test_Y=domain_test_Y, test_Y_pred=domain_Y7_pred)

# The only variant that weights neighbours by distance.
clf_10 = KNeighborsClassifier(weights='distance', n_neighbors=10)
domain_Y10_pred = run_classifier_pipe(
    vec=vec, clf=clf_10,
    train_X=domain_train_X, train_Y=domain_train_Y, test_X=domain_test_X,
)
print_scores(test_Y=domain_test_Y, test_Y_pred=domain_Y10_pred)

clf_15 = KNeighborsClassifier(n_neighbors=15)
domain_Y15_pred = run_classifier_pipe(
    vec=vec, clf=clf_15,
    train_X=domain_train_X, train_Y=domain_train_Y, test_X=domain_test_X,
)
print_scores(test_Y=domain_test_Y, test_Y_pred=domain_Y15_pred)

clf_20 = KNeighborsClassifier(n_neighbors=20)
domain_Y20_pred = run_classifier_pipe(
    vec=vec, clf=clf_20,
    train_X=domain_train_X, train_Y=domain_train_Y, test_X=domain_test_X,
)
print_scores(test_Y=domain_test_Y, test_Y_pred=domain_Y20_pred)

from sklearn.naive_bayes import MultinomialNB

# Naive Bayes on word counts; the target here is the rating value.
bayes_train_X, bayes_train_Y = mixed_train['text'], mixed_train['rating']
bayes_test_X, bayes_test_Y = mixed_test['text'], mixed_test['rating']
vec = CountVectorizer()
clf_bayes = MultinomialNB()

# Rating prediction with Naive Bayes; micro-averaged precision is reported
# in addition to the macro scores printed by print_scores.
bayes_pred_Y = run_classifier_pipe(
    vec=vec, clf=clf_bayes,
    train_X=bayes_train_X, train_Y=bayes_train_Y, test_X=bayes_test_X,
)
print_scores(test_Y=bayes_test_Y, test_Y_pred=bayes_pred_Y)
micro_precision = precision_score(bayes_test_Y, bayes_pred_Y, average='micro')
print(f"Micro-average precision: {micro_precision}")

# Same Naive Bayes setup, now with polarity as the target.
clf = MultinomialNB()
bayes_train_Y, bayes_test_Y = mixed_train['polarity'], mixed_test['polarity']
bayes_pred_Y = run_classifier_pipe(
    vec=vec, clf=clf,
    train_X=bayes_train_X, train_Y=bayes_train_Y, test_X=bayes_test_X,
)
print_scores(test_Y=bayes_test_Y, test_Y_pred=bayes_pred_Y)

from sklearn.tree import DecisionTreeClassifier

# Polarity classification with decision trees on word counts.
train_X, train_Y = mixed_train['text'], mixed_train['polarity']
test_X, test_Y = mixed_test['text'], mixed_test['polarity']
vec = CountVectorizer()
clf = DecisionTreeClassifier()

# Decision-tree variants; each is fitted and scored on the mixed data.

# Default tree.
Y_pred = run_classifier_pipe(
    vec=vec, clf=clf, train_X=train_X, train_Y=train_Y, test_X=test_X
)
print_scores(test_Y=test_Y, test_Y_pred=Y_pred)

# Random splitter.
clf_r = DecisionTreeClassifier(splitter="random")
Y_pred_r = run_classifier_pipe(
    vec=vec, clf=clf_r, train_X=train_X, train_Y=train_Y, test_X=test_X
)
print_scores(test_Y=test_Y, test_Y_pred=Y_pred_r)

# Entropy criterion.
clf_c = DecisionTreeClassifier(criterion="entropy")
Y_pred_c = run_classifier_pipe(
    vec=vec, clf=clf_c, train_X=train_X, train_Y=train_Y, test_X=test_X
)
print_scores(test_Y=test_Y, test_Y_pred=Y_pred_c)

# Minimum sample counts for splits and leaves.
clf_ms = DecisionTreeClassifier(min_samples_leaf=10, min_samples_split=10)
Y_pred_ms = run_classifier_pipe(
    vec=vec, clf=clf_ms, train_X=train_X, train_Y=train_Y, test_X=test_X
)
print_scores(test_Y=test_Y, test_Y_pred=Y_pred_ms)

# Entropy criterion with the depth capped at 2.
clf_md = DecisionTreeClassifier(max_depth=2, criterion="entropy")
Y_pred_md = run_classifier_pipe(
    vec=vec, clf=clf_md, train_X=train_X, train_Y=train_Y, test_X=test_X
)
print_scores(test_Y=test_Y, test_Y_pred=Y_pred_md)

# Explicit class weights.
polarity_weights = {"positive": 0.01, "negative": 0.99}
clf_cw = DecisionTreeClassifier(class_weight=polarity_weights)
Y_pred_cw = run_classifier_pipe(
    vec=vec, clf=clf_cw, train_X=train_X, train_Y=train_Y, test_X=test_X
)
print_scores(test_Y=test_Y, test_Y_pred=Y_pred_cw)

from sklearn.svm import SVC

# Polarity classification with linear-kernel SVMs on word counts.
train_X, train_Y = mixed_train["text"], mixed_train["polarity"]
test_X, test_Y = mixed_test["text"], mixed_test["polarity"]
vec = CountVectorizer()

clf_linear_c_1 = SVC(kernel="linear")  # 1 is the default parameter for C
# The remaining variants differ only in the value of C.
clf_linear_c_2, clf_linear_c_4, clf_linear_c_8, clf_linear_c_128 = (
    SVC(kernel="linear", C=c_value) for c_value in (2.0, 4.0, 8.0, 128.0)
)

# Fit and score each linear SVM, in order of increasing C.
y_pred_lin_1 = run_classifier_pipe(
    vec=vec, clf=clf_linear_c_1, train_X=train_X, train_Y=train_Y, test_X=test_X
)
print_scores(test_Y=test_Y, test_Y_pred=y_pred_lin_1)

y_pred_lin_2 = run_classifier_pipe(
    vec=vec, clf=clf_linear_c_2, train_X=train_X, train_Y=train_Y, test_X=test_X
)
print_scores(test_Y=test_Y, test_Y_pred=y_pred_lin_2)

y_pred_lin_4 = run_classifier_pipe(
    vec=vec, clf=clf_linear_c_4, train_X=train_X, train_Y=train_Y, test_X=test_X
)
print_scores(test_Y=test_Y, test_Y_pred=y_pred_lin_4)

y_pred_lin_8 = run_classifier_pipe(
    vec=vec, clf=clf_linear_c_8, train_X=train_X, train_Y=train_Y, test_X=test_X
)
print_scores(test_Y=test_Y, test_Y_pred=y_pred_lin_8)

y_pred_lin_128 = run_classifier_pipe(
    vec=vec, clf=clf_linear_c_128, train_X=train_X, train_Y=train_Y, test_X=test_X
)
print_scores(test_Y=test_Y, test_Y_pred=y_pred_lin_128)

# SVMs with different C / gamma combinations. No kernel is passed, so every
# one of them uses the library default.
clf_rbf_1_scale = SVC()  # default kernel is rbf already
clf_rbf_2_scale = SVC(C=2.0)
clf_rbf_128_scale = SVC(C=128.0)

clf_rbf_1_auto = SVC(gamma="auto")
clf_rbf_128_auto = SVC(gamma="auto", C=128.0)

clf_rbf_1_4 = SVC(gamma=4.0)
clf_rbf_128_128 = SVC(gamma=128.0, C=128.0)
clf_rbf_1_05 = SVC(gamma=0.5, C=1)

# Fit and score every default-kernel SVM variant on the mixed data.
y_pred_rbf_1_scale = run_classifier_pipe(
    vec=vec, clf=clf_rbf_1_scale, train_X=train_X, train_Y=train_Y, test_X=test_X
)
print_scores(test_Y=test_Y, test_Y_pred=y_pred_rbf_1_scale)

y_pred_rbf_2_scale = run_classifier_pipe(
    vec=vec, clf=clf_rbf_2_scale, train_X=train_X, train_Y=train_Y, test_X=test_X
)
print_scores(test_Y=test_Y, test_Y_pred=y_pred_rbf_2_scale)

y_pred_rbf_128_scale = run_classifier_pipe(
    vec=vec, clf=clf_rbf_128_scale, train_X=train_X, train_Y=train_Y, test_X=test_X
)
print_scores(test_Y=test_Y, test_Y_pred=y_pred_rbf_128_scale)

y_pred_rbf_1_auto = run_classifier_pipe(
    vec=vec, clf=clf_rbf_1_auto, train_X=train_X, train_Y=train_Y, test_X=test_X
)
print_scores(test_Y=test_Y, test_Y_pred=y_pred_rbf_1_auto)

y_pred_rbf_128_auto = run_classifier_pipe(
    vec=vec, clf=clf_rbf_128_auto, train_X=train_X, train_Y=train_Y, test_X=test_X
)
print_scores(test_Y=test_Y, test_Y_pred=y_pred_rbf_128_auto)

y_pred_rbf_1_4 = run_classifier_pipe(
    vec=vec, clf=clf_rbf_1_4, train_X=train_X, train_Y=train_Y, test_X=test_X
)
print_scores(test_Y=test_Y, test_Y_pred=y_pred_rbf_1_4)

y_pred_rbf_128_128 = run_classifier_pipe(
    vec=vec, clf=clf_rbf_128_128, train_X=train_X, train_Y=train_Y, test_X=test_X
)
print_scores(test_Y=test_Y, test_Y_pred=y_pred_rbf_128_128)

y_pred_rbf_1_05 = run_classifier_pipe(
    vec=vec, clf=clf_rbf_1_05, train_X=train_X, train_Y=train_Y, test_X=test_X
)
print_scores(test_Y=test_Y, test_Y_pred=y_pred_rbf_1_05)

# Cross-domain experiment: text inputs and polarity targets per domain.
books_train_X = books_train['text']
books_train_Y = books_train['polarity']
dvd_train_X = dvd_train["text"]
dvd_train_Y = dvd_train["polarity"]
electronics_train_X = electronics_train["text"]
electronics_train_Y = electronics_train["polarity"]
kitchen_train_X = kitchen_train["text"]
# Previously the polarity column was assigned to kitchen_train_X again,
# which replaced the text inputs with labels.
kitchen_train_Y = kitchen_train["polarity"]

books_test_X = books_test['text']
books_test_Y = books_test['polarity']
dvd_test_X = dvd_test["text"]
dvd_test_Y = dvd_test["polarity"]
electronics_test_X = electronics_test["text"]
electronics_test_Y = electronics_test["polarity"]
kitchen_test_X = kitchen_test["text"]
# Same correction for the test split: keep the text in *_X, labels in *_Y.
kitchen_test_Y = kitchen_test["polarity"]

from sklearn.pipeline import Pipeline
from sklearn.linear_model import Perceptron

books_clf = Perceptron()
dvd_clf = Perceptron()
electronics_clf = Perceptron()
kitchen_clf = Perceptron()
# One vectorizer per model: each fitted pipeline keeps its own vectorizer.
vec = CountVectorizer()
vec2 = CountVectorizer()
vec3 = CountVectorizer()
vec4 = CountVectorizer()


def create_train_pipeline(vec, clf, train_X, train_Y):
    """Build a vectorizer + classifier pipeline and fit it.

    Args:
        vec: vectorizer used as the first pipeline step.
        clf: classifier used as the final pipeline step.
        train_X: training inputs.
        train_Y: training targets.

    Returns:
        The fitted ``Pipeline``.
    """
    pipeline = Pipeline([
        ('vectorizer', vec),
        ('classifier', clf)
    ])
    pipeline.fit(train_X, train_Y)
    return pipeline


books_model = create_train_pipeline(vec, books_clf, books_train_X, books_train_Y)
dvd_model = create_train_pipeline(vec2, dvd_clf, dvd_train_X, dvd_train_Y)
electronics_model = create_train_pipeline(vec3, electronics_clf, electronics_train_X, electronics_train_Y)
# Train on (text, polarity); the labels used to be passed as both X and y.
kitchen_model = create_train_pipeline(vec4, kitchen_clf, kitchen_train_X, kitchen_train_Y)

# Apply every single-domain model to the test inputs of all four domains.
# Prefix: b = books model, d = dvd model, e = electronics model,
# k = kitchen model.
b_books_y_pred, b_dvd_y_pred, b_elec_y_pred, b_kitchen_y_pred = (
    books_model.predict(books_test_X),
    books_model.predict(dvd_test_X),
    books_model.predict(electronics_test_X),
    books_model.predict(kitchen_test_X),
)

d_books_y_pred, d_dvd_y_pred, d_elec_y_pred, d_kitchen_y_pred = (
    dvd_model.predict(books_test_X),
    dvd_model.predict(dvd_test_X),
    dvd_model.predict(electronics_test_X),
    dvd_model.predict(kitchen_test_X),
)

e_books_y_pred, e_dvd_y_pred, e_elec_y_pred, e_kitchen_y_pred = (
    electronics_model.predict(books_test_X),
    electronics_model.predict(dvd_test_X),
    electronics_model.predict(electronics_test_X),
    electronics_model.predict(kitchen_test_X),
)

k_books_y_pred, k_dvd_y_pred, k_elec_y_pred, k_kitchen_y_pred = (
    kitchen_model.predict(books_test_X),
    kitchen_model.predict(dvd_test_X),
    kitchen_model.predict(electronics_test_X),
    kitchen_model.predict(kitchen_test_X),
)

# Polarity model trained on the reviews of all domains together.
mixed_train_X = mixed_train["text"]
mixed_train_Y = mixed_train["polarity"]
mixed_test_X = mixed_test["text"]
mixed_test_Y = mixed_test["polarity"]

clf_mixed = Perceptron()
# Use a dedicated vectorizer. ``vec`` is already a step of ``books_model``,
# and fitting another pipeline with the same object would refit it on the
# mixed data and leave ``books_model`` with a mismatched vocabulary.
vec_mixed = CountVectorizer()
mixed_model = create_train_pipeline(vec_mixed, clf_mixed, mixed_train_X, mixed_train_Y)

# Apply the mixed-domain model to each domain's test inputs and to the
# mixed test set.
m_books_y_pred = mixed_model.predict(books_test_X)
m_dvd_y_pred = mixed_model.predict(dvd_test_X)
m_elec_y_pred = mixed_model.predict(electronics_test_X)
m_kitchen_y_pred = mixed_model.predict(kitchen_test_X)
# Predict from the review text (mixed_test_X); the polarity labels
# (mixed_test_Y) were passed here before.
m_mixed_y_pred = mixed_model.predict(mixed_test_X)