import pandas as pd
from sklearn.model_selection import train_test_split
import seaborn as sns
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
# PARSE DATA
# The raw file is a sequence of pseudo-XML review blocks: the line following a tag such
# as <rating>, <review_text> or <product_type> holds that field's value.
books_reviews = []
dvd_reviews = []
electronics_reviews = []
kitchen_reviews = []
with open("data/reviews.txt", "r") as f:
    data = f.read().splitlines()
review = {'rating': '', 'text': '', 'domain': ''}
for x in range(len(data)):
    if data[x] == '<rating>':
        review['rating'] = data[x + 1]
    if data[x] == '<review_text>':
        review['text'] = data[x + 1]
    if data[x] == '<product_type>':
        review['domain'] = data[x + 1]
    # A new <unique_id> marks the start of the next review, so store the one collected so far.
    if data[x] == '<unique_id>' and review != {'rating': '', 'text': '', 'domain': ''}:
        if review['domain'] == 'books':
            books_reviews.append(review)
        if review['domain'] == 'dvd':
            dvd_reviews.append(review)
        if review['domain'] == 'electronics':
            electronics_reviews.append(review)
        if review['domain'] == 'kitchen & housewares':
            kitchen_reviews.append(review)
        review = {'rating': '', 'text': '', 'domain': ''}
# The last review in the file is not followed by another <unique_id>, so store it as well.
if review != {'rating': '', 'text': '', 'domain': ''}:
    if review['domain'] == 'books':
        books_reviews.append(review)
    if review['domain'] == 'dvd':
        dvd_reviews.append(review)
    if review['domain'] == 'electronics':
        electronics_reviews.append(review)
    if review['domain'] == 'kitchen & housewares':
        kitchen_reviews.append(review)
def transform_to_dataframe(reviews):
    # One DataFrame per domain: raw rating, review text, the domain name, and a binary
    # polarity label (ratings of 4.0/5.0 count as positive, everything else as negative).
    return pd.DataFrame(data={
        'rating': [r['rating'] for r in reviews],
        'text': [r['text'] for r in reviews],
        'domain': reviews[0]['domain'],
        'polarity': ['positive' if r['rating'] in ('4.0', '5.0') else 'negative' for r in reviews]
    })
books_reviews = transform_to_dataframe(books_reviews)
dvd_reviews = transform_to_dataframe(dvd_reviews)
electronics_reviews = transform_to_dataframe(electronics_reviews)
kitchen_reviews = transform_to_dataframe(kitchen_reviews)
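# Pool all four domains and plot the rating distribution per domain
# (the ratings take the four values 1.0, 2.0, 4.0 and 5.0).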
mixed = pd.concat([books_reviews, dvd_reviews, electronics_reviews, kitchen_reviews]).reset_index(drop=True)
mixed['rating'] = pd.Categorical(mixed['rating'], categories=['1.0', '2.0', '4.0', '5.0'])
ax = sns.displot(mixed, x="rating", multiple='dodge', hue='domain')
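# Shuffle each domain with a fixed seed before splitting.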
books_reviews = books_reviews.sample(frac=1, random_state=42).reset_index(drop=True)
dvd_reviews = dvd_reviews.sample(frac=1, random_state=42).reset_index(drop=True)
electronics_reviews = electronics_reviews.sample(frac=1, random_state=42).reset_index(drop=True)
kitchen_reviews = kitchen_reviews.sample(frac=1, random_state=42).reset_index(drop=True)
dvd_reviews.head()
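# 80/20 train/test split per domain, then pooled (mixed-domain) train and test sets.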
books_train, books_test = train_test_split(books_reviews, test_size=0.2, random_state=42)
dvd_train, dvd_test = train_test_split(dvd_reviews, test_size=0.2, random_state=42)
electronics_train, electronics_test = train_test_split(electronics_reviews, test_size=0.2, random_state=42)
kitchen_train, kitchen_test = train_test_split(kitchen_reviews, test_size=0.2, random_state=42)
mixed_train = pd.concat([books_train, dvd_train, electronics_train, kitchen_train])
mixed_train = mixed_train.sample(frac=1, random_state=42).reset_index(drop=True)
mixed_train.head()
mixed_test = pd.concat([books_test, dvd_test, electronics_test, kitchen_test])
mixed_test = mixed_test.sample(frac=1, random_state=42).reset_index(drop=True)
mixed_test.head()
def distribution_table(dataset):
    # Plot the rating distribution of a dataset split.
    ax = sns.histplot(data=dataset, x="rating")
distribution_table(mixed_train)
distribution_table(mixed_test)
# Majority-class baseline: always predict the most frequent rating (5.0).
unique, amount = np.unique(mixed_test["rating"].to_numpy(), return_counts=True)
counts = dict(zip(unique, amount))
print(counts)
total = sum(counts.values())
tp = counts['5.0'] / total  # fraction of the test set actually rated 5.0
print(tp)
# Everything is predicted as 5.0, so precision equals that fraction (FP is 1 - tp)
# and recall is 1 (there are no false negatives).
precision = tp / (tp + (1 - tp))
recall = tp / (tp + 0)
f_score = (2 * precision * recall) / (precision + recall)
print(f"precision: {precision}, recall: {recall}, f_score: {f_score}")
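# Stratified dummy classifier on the books ratings as another baseline.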
X = books_train['text'] # list(map(lambda x : x["text"], books_train))
y = books_train['rating'] # list(map(lambda x : x["rating"], books_train))
dummy_clf = DummyClassifier(strategy="stratified")
dummy_clf.fit(X, y)
y_pred = dummy_clf.predict(books_test['text'])
def print_scores(test_Y, test_Y_pred):
    print(f"Confusion matrix:\n {confusion_matrix(test_Y, test_Y_pred)}")
    print(f"Accuracy: {accuracy_score(test_Y, test_Y_pred)}")
    print(f"Precision: {precision_score(test_Y, test_Y_pred, average='macro')}")
    print(f"Recall: {recall_score(test_Y, test_Y_pred, average='macro')}")
    print(f"F-score: {f1_score(test_Y, test_Y_pred, average='macro')}")
print_scores(books_test['rating'], y_pred)
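# Polarity classification on the mixed-domain data: compare several vectorizers,
# each paired with a Perceptron.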
train_X = mixed_train['text']
train_Y = mixed_train['polarity']
test_X = mixed_test['text']
test_Y = mixed_test['polarity']
def run_classifier_pipe(vec, clf, train_X, train_Y, test_X):
    # Fit a vectorizer + classifier pipeline on the training data and return the
    # predictions for the test data.
    pipeline = Pipeline([
        ('vectorizer', vec),
        ('classifier', clf)
    ])
    pipeline.fit(train_X, train_Y)
    return pipeline.predict(test_X)
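# TF-IDF features with English stop words removed.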
vec = TfidfVectorizer(stop_words="english")
clf = Perceptron()
test_Y_pred_Tfidf = run_classifier_pipe(vec, clf, train_X, train_Y, test_X)
print_scores(test_Y, test_Y_pred_Tfidf)
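# Plain bag-of-words token counts.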
vec = CountVectorizer()
clf = Perceptron()
test_Y_pred_counts = run_classifier_pipe(vec, clf, train_X, train_Y, test_X)
print_scores(test_Y, test_Y_pred_counts)
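# Word 4-grams only.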
vec = CountVectorizer(ngram_range=(4, 4))
clf = Perceptron()
test_Y_pred_counts_ng44 = run_classifier_pipe(vec, clf, train_X, train_Y, test_X)
print_scores(test_Y, test_Y_pred_counts_ng44)
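# Word 2- to 4-grams, keeping the original casing.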
vec = CountVectorizer(analyzer='word', ngram_range=(2, 4), lowercase=False)
clf = Perceptron()
test_Y_pred_counts_ng24 = run_classifier_pipe(vec, clf, train_X, train_Y, test_X)
print_scores(test_Y, test_Y_pred_counts_ng24)
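# Character n-grams of length 2 to 10.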
vec = CountVectorizer(analyzer='char', ngram_range=(2, 10))
clf = Perceptron()
test_Y_pred_counts_char = run_classifier_pipe(vec, clf, train_X, train_Y, test_X)
print_scores(test_Y, test_Y_pred_counts_char)
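# Character n-grams of length 2 to 10, restricted to word boundaries (char_wb).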
vec = CountVectorizer(analyzer='char_wb', ngram_range=(2, 10))
clf = Perceptron()
test_Y_pred_counts_wbchar = run_classifier_pipe(vec, clf, train_X, train_Y, test_X)
print_scores(test_Y, test_Y_pred_counts_wbchar)
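# Feature hashing with the default 2^20 features.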
vec = HashingVectorizer()
clf = Perceptron()
test_Y_pred_hashing = run_classifier_pipe(vec, clf, train_X, train_Y, test_X)
print_scores(test_Y, test_Y_pred_hashing)
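# Feature hashing with only 2^4 = 16 features (many collisions).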
vec = HashingVectorizer(n_features=2**4)
clf = Perceptron()
test_Y_pred_hashing_2p4 = run_classifier_pipe(vec, clf, train_X, train_Y, test_X)
print_scores(test_Y, test_Y_pred_hashing_2p4)
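# Feature hashing with 2^31 - 1 features.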
vec = HashingVectorizer(n_features=(2**31 - 1))
clf = Perceptron()
test_Y_pred_hashing_2p31 = run_classifier_pipe(vec, clf, train_X, train_Y, test_X)
print_scores(test_Y, test_Y_pred_hashing_2p31)
# Collect the scores of all vectorizer variants into one overview table.
predictions = {
    'TF-IDF': test_Y_pred_Tfidf,
    'Counts': test_Y_pred_counts,
    'Counts ngrams=4,4': test_Y_pred_counts_ng44,
    'Counts ngrams=2,4': test_Y_pred_counts_ng24,
    'Counts char': test_Y_pred_counts_char,
    'Counts wb_char': test_Y_pred_counts_wbchar,
    'Hashing': test_Y_pred_hashing,
    'Hashing 2^4': test_Y_pred_hashing_2p4,
    'Hashing 2^31-1': test_Y_pred_hashing_2p31
}
overviewtable = pd.DataFrame(data={
    name: [
        round(accuracy_score(test_Y, pred), 4),
        round(precision_score(test_Y, pred, average='macro'), 4),
        round(recall_score(test_Y, pred, average='macro'), 4),
        round(f1_score(test_Y, pred, average='macro'), 4)
    ]
    for name, pred in predictions.items()
}, index=['Accuracy', 'Precision', 'Recall', 'F-score'])
overviewtable.transpose().head(10)
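# Per-domain polarity classification: a separate Perceptron with plain counts for each
# domain (the shared CountVectorizer is refit inside each pipeline run).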
train_books_X = books_train['text']
train_books_Y = books_train['polarity']
test_books_X = books_test['text']
test_books_Y = books_test['polarity']
train_dvd_X = dvd_train['text']
train_dvd_Y = dvd_train['polarity']
test_dvd_X = dvd_test['text']
test_dvd_Y = dvd_test['polarity']
train_electronics_X = electronics_train['text']
train_electronics_Y = electronics_train['polarity']
test_electronics_X = electronics_test['text']
test_electronics_Y = electronics_test['polarity']
train_kitchen_X = kitchen_train['text']
train_kitchen_Y = kitchen_train['polarity']
test_kitchen_X = kitchen_test['text']
test_kitchen_Y = kitchen_test['polarity']
books_clf = Perceptron()
dvd_clf = Perceptron()
electronics_clf = Perceptron()
kitchen_clf = Perceptron()
vec = CountVectorizer()
books_Y_pred = run_classifier_pipe(vec, books_clf, train_books_X, train_books_Y, test_books_X)
print_scores(test_books_Y, books_Y_pred)
dvd_Y_pred = run_classifier_pipe(vec, dvd_clf, train_dvd_X, train_dvd_Y, test_dvd_X)
print_scores(test_dvd_Y, dvd_Y_pred)
electronics_Y_pred = run_classifier_pipe(vec, electronics_clf, train_electronics_X, train_electronics_Y, test_electronics_X)
print_scores(test_electronics_Y, electronics_Y_pred)
kitchen_Y_pred = run_classifier_pipe(vec, kitchen_clf, train_kitchen_X, train_kitchen_Y, test_kitchen_X)
print_scores(test_kitchen_Y, kitchen_Y_pred)
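# Vary the Perceptron's iteration budget on the books data.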
books_clf_iter_max_changes = Perceptron(max_iter=2000)
books_Y2_pred = run_classifier_pipe(vec, books_clf_iter_max_changes, train_books_X, train_books_Y, test_books_X)
print_scores(test_books_Y, books_Y2_pred)
books_clf_iter_min_changes = Perceptron(max_iter=5)
books_Y3_pred = run_classifier_pipe(vec, books_clf_iter_min_changes, train_books_X, train_books_Y, test_books_X)
print_scores(test_books_Y, books_Y3_pred)
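# Domain (product type) classification with k-nearest neighbours on char_wb n-grams.
# Note that CountVectorizer ignores stop_words when analyzer != 'word'.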
domain_train_X = mixed_train['text']
domain_train_Y = mixed_train['domain']
domain_test_X = mixed_test['text']
domain_test_Y = mixed_test['domain']
vec = CountVectorizer(analyzer='char_wb', ngram_range=(2,5), stop_words='english', max_df=0.95)
clf = KNeighborsClassifier()
domain_Y_pred = run_classifier_pipe(vec, clf, domain_train_X, domain_train_Y, domain_test_X)
print_scores(domain_test_Y, domain_Y_pred)
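# Vary the number of neighbours (and, for k=10, use distance-based weighting).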
clf_2 = KNeighborsClassifier(n_neighbors=2)
domain_Y2_pred = run_classifier_pipe(vec, clf_2, domain_train_X, domain_train_Y, domain_test_X)
print_scores(domain_test_Y, domain_Y2_pred)
clf_7 = KNeighborsClassifier(n_neighbors=7)
domain_Y7_pred = run_classifier_pipe(vec, clf_7, domain_train_X, domain_train_Y, domain_test_X)
print_scores(domain_test_Y, domain_Y7_pred)
clf_10 = KNeighborsClassifier(n_neighbors=10, weights='distance')
domain_Y10_pred = run_classifier_pipe(vec, clf_10, domain_train_X, domain_train_Y, domain_test_X)
print_scores(domain_test_Y, domain_Y10_pred)
clf_15 = KNeighborsClassifier(n_neighbors=15)
domain_Y15_pred = run_classifier_pipe(vec, clf_15, domain_train_X, domain_train_Y, domain_test_X)
print_scores(domain_test_Y, domain_Y15_pred)
clf_20 = KNeighborsClassifier(n_neighbors=20)
domain_Y20_pred = run_classifier_pipe(vec, clf_20, domain_train_X, domain_train_Y, domain_test_X)
print_scores(domain_test_Y, domain_Y20_pred)
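# Multinomial Naive Bayes: first on the four rating classes, then on binary polarity.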
bayes_train_X = mixed_train['text']
bayes_train_Y = mixed_train['rating']
bayes_test_X = mixed_test['text']
bayes_test_Y = mixed_test['rating']
vec = CountVectorizer()
clf_bayes = MultinomialNB()
bayes_pred_Y = run_classifier_pipe(vec, clf_bayes, bayes_train_X, bayes_train_Y, bayes_test_X)
print_scores(bayes_test_Y, bayes_pred_Y)
print(f"Micro-average precision: {precision_score(bayes_test_Y, bayes_pred_Y, average='micro')}")
clf = MultinomialNB()
bayes_train_Y = mixed_train['polarity']
bayes_test_Y = mixed_test['polarity']
bayes_pred_Y = run_classifier_pipe(vec, clf, bayes_train_X, bayes_train_Y, bayes_test_X)
print_scores(bayes_test_Y, bayes_pred_Y)
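# Decision trees on mixed-domain polarity: vary the splitter, criterion,
# minimum split/leaf sizes, maximum depth, and class weights.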
train_X = mixed_train['text']
train_Y = mixed_train['polarity']
test_X = mixed_test['text']
test_Y = mixed_test['polarity']
vec = CountVectorizer()
clf = DecisionTreeClassifier()
Y_pred = run_classifier_pipe(vec, clf, train_X, train_Y, test_X)
print_scores(test_Y, Y_pred)
clf_r = DecisionTreeClassifier(splitter="random")
Y_pred_r = run_classifier_pipe(vec, clf_r, train_X, train_Y, test_X)
print_scores(test_Y, Y_pred_r)
clf_c = DecisionTreeClassifier(criterion="entropy")
Y_pred_c = run_classifier_pipe(vec, clf_c, train_X, train_Y, test_X)
print_scores(test_Y, Y_pred_c)
clf_ms = DecisionTreeClassifier(min_samples_split=10, min_samples_leaf=10)
Y_pred_ms = run_classifier_pipe(vec, clf_ms, train_X, train_Y, test_X)
print_scores(test_Y, Y_pred_ms)
clf_md = DecisionTreeClassifier(criterion="entropy", max_depth=2)
Y_pred_md = run_classifier_pipe(vec, clf_md, train_X, train_Y, test_X)
print_scores(test_Y, Y_pred_md)
clf_cw = DecisionTreeClassifier(class_weight={"positive": 0.01, "negative": 0.99})
Y_pred_cw = run_classifier_pipe(vec, clf_cw, train_X, train_Y, test_X)
print_scores(test_Y, Y_pred_cw)
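# Support vector machines: a linear kernel with increasing C, then an RBF kernel with
# various C and gamma settings.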
train_X = mixed_train["text"]
train_Y = mixed_train["polarity"]
test_X = mixed_test["text"]
test_Y = mixed_test["polarity"]
vec = CountVectorizer()
clf_linear_c_1 = SVC(kernel="linear") # 1 is the default parameter for C
clf_linear_c_2 = SVC(kernel="linear", C=2.0)
clf_linear_c_4 = SVC(kernel="linear", C=4.0)
clf_linear_c_8 = SVC(kernel="linear", C=8.0)
clf_linear_c_128 = SVC(kernel="linear", C=128.0)
y_pred_lin_1 = run_classifier_pipe(vec, clf_linear_c_1, train_X, train_Y, test_X)
print_scores(test_Y, y_pred_lin_1)
y_pred_lin_2 = run_classifier_pipe(vec, clf_linear_c_2, train_X, train_Y, test_X)
print_scores(test_Y, y_pred_lin_2)
y_pred_lin_4 = run_classifier_pipe(vec, clf_linear_c_4, train_X, train_Y, test_X)
print_scores(test_Y, y_pred_lin_4)
y_pred_lin_8 = run_classifier_pipe(vec, clf_linear_c_8, train_X, train_Y, test_X)
print_scores(test_Y, y_pred_lin_8)
y_pred_lin_128 = run_classifier_pipe(vec, clf_linear_c_128, train_X, train_Y, test_X)
print_scores(test_Y, y_pred_lin_128)
clf_rbf_1_scale = SVC() # default kernel is rbf already
clf_rbf_2_scale = SVC(C=2.0)
clf_rbf_128_scale = SVC(C=128.0)
clf_rbf_1_auto = SVC(gamma="auto")
clf_rbf_128_auto = SVC(C=128.0, gamma="auto")
clf_rbf_1_4 = SVC(gamma=4.0)
clf_rbf_128_128 = SVC(C=128.0, gamma=128.0)
clf_rbf_1_05 = SVC(C=1, gamma=0.5)
y_pred_rbf_1_scale = run_classifier_pipe(vec, clf_rbf_1_scale, train_X, train_Y, test_X)
print_scores(test_Y, y_pred_rbf_1_scale)
y_pred_rbf_2_scale = run_classifier_pipe(vec, clf_rbf_2_scale, train_X, train_Y, test_X)
print_scores(test_Y, y_pred_rbf_2_scale)
y_pred_rbf_128_scale = run_classifier_pipe(vec, clf_rbf_128_scale, train_X, train_Y, test_X)
print_scores(test_Y, y_pred_rbf_128_scale)
y_pred_rbf_1_auto = run_classifier_pipe(vec, clf_rbf_1_auto, train_X, train_Y, test_X)
print_scores(test_Y, y_pred_rbf_1_auto)
y_pred_rbf_128_auto = run_classifier_pipe(vec, clf_rbf_128_auto, train_X, train_Y, test_X)
print_scores(test_Y, y_pred_rbf_128_auto)
y_pred_rbf_1_4 = run_classifier_pipe(vec, clf_rbf_1_4, train_X, train_Y, test_X)
print_scores(test_Y, y_pred_rbf_1_4)
y_pred_rbf_128_128 = run_classifier_pipe(vec, clf_rbf_128_128, train_X, train_Y, test_X)
print_scores(test_Y, y_pred_rbf_128_128)
y_pred_rbf_1_05 = run_classifier_pipe(vec, clf_rbf_1_05, train_X, train_Y, test_X)
print_scores(test_Y, y_pred_rbf_1_05)
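# Cross-domain evaluation: train one Perceptron per domain (and one on the pooled data),
# then apply every model to every domain's test set.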
books_train_X = books_train['text']
books_train_Y = books_train['polarity']
dvd_train_X = dvd_train["text"]
dvd_train_Y = dvd_train["polarity"]
electronics_train_X = electronics_train["text"]
electronics_train_Y = electronics_train["polarity"]
kitchen_train_X = kitchen_train["text"]
kitchen_train_Y = kitchen_train["polarity"]
books_test_X = books_test['text']
books_test_Y = books_test['polarity']
dvd_test_X = dvd_test["text"]
dvd_test_Y = dvd_test["polarity"]
electronics_test_X = electronics_test["text"]
electronics_test_Y = electronics_test["polarity"]
kitchen_test_X = kitchen_test["text"]
kitchen_test_Y = kitchen_test["polarity"]
books_clf = Perceptron()
dvd_clf = Perceptron()
electronics_clf = Perceptron()
kitchen_clf = Perceptron()
vec = CountVectorizer()
vec2 = CountVectorizer()
vec3 = CountVectorizer()
vec4 = CountVectorizer()
def create_train_pipeline(vec, clf, train_X, train_Y):
    # Like run_classifier_pipe, but returns the fitted pipeline so the same model can be
    # applied to several different test sets.
    pipeline = Pipeline([
        ('vectorizer', vec),
        ('classifier', clf)
    ])
    pipeline.fit(train_X, train_Y)
    return pipeline
books_model = create_train_pipeline(vec, books_clf, books_train_X, books_train_Y)
dvd_model = create_train_pipeline(vec2, dvd_clf, dvd_train_X, dvd_train_Y)
electronics_model = create_train_pipeline(vec3, electronics_clf, electronics_train_X, electronics_train_Y)
kitchen_model = create_train_pipeline(vec4, kitchen_clf, kitchen_train_X, kitchen_train_Y)
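# Apply each domain-specific model to every domain's test set.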
b_books_y_pred = books_model.predict(books_test_X)
b_dvd_y_pred = books_model.predict(dvd_test_X)
b_elec_y_pred = books_model.predict(electronics_test_X)
b_kitchen_y_pred = books_model.predict(kitchen_test_X)
d_books_y_pred = dvd_model.predict(books_test_X)
d_dvd_y_pred = dvd_model.predict(dvd_test_X)
d_elec_y_pred = dvd_model.predict(electronics_test_X)
d_kitchen_y_pred = dvd_model.predict(kitchen_test_X)
e_books_y_pred = electronics_model.predict(books_test_X)
e_dvd_y_pred = electronics_model.predict(dvd_test_X)
e_elec_y_pred = electronics_model.predict(electronics_test_X)
e_kitchen_y_pred = electronics_model.predict(kitchen_test_X)
k_books_y_pred = kitchen_model.predict(books_test_X)
k_dvd_y_pred = kitchen_model.predict(dvd_test_X)
k_elec_y_pred = kitchen_model.predict(electronics_test_X)
k_kitchen_y_pred = kitchen_model.predict(kitchen_test_X)
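# A sketch (using only names defined above) of how the cross-domain results can be
# summarised: rows are the training domain, columns the test domain, cells the accuracy.
# `cross_domain_acc` is introduced here purely for illustration.
cross_domain_acc = pd.DataFrame({
    'books': [accuracy_score(books_test_Y, p) for p in (b_books_y_pred, d_books_y_pred, e_books_y_pred, k_books_y_pred)],
    'dvd': [accuracy_score(dvd_test_Y, p) for p in (b_dvd_y_pred, d_dvd_y_pred, e_dvd_y_pred, k_dvd_y_pred)],
    'electronics': [accuracy_score(electronics_test_Y, p) for p in (b_elec_y_pred, d_elec_y_pred, e_elec_y_pred, k_elec_y_pred)],
    'kitchen': [accuracy_score(kitchen_test_Y, p) for p in (b_kitchen_y_pred, d_kitchen_y_pred, e_kitchen_y_pred, k_kitchen_y_pred)]
}, index=['books', 'dvd', 'electronics', 'kitchen'])
cross_domain_acc
# A single Perceptron trained on the pooled (mixed-domain) data, evaluated below.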
mixed_train_X = mixed_train["text"]
mixed_train_Y = mixed_train["polarity"]
mixed_test_X = mixed_test["text"]
mixed_test_Y = mixed_test["polarity"]
clf_mixed = Perceptron()
# Use a fresh vectorizer here: reusing `vec` would refit the vectorizer object that is
# already part of books_model's pipeline.
vec_mixed = CountVectorizer()
mixed_model = create_train_pipeline(vec_mixed, clf_mixed, mixed_train_X, mixed_train_Y)
m_books_y_pred = mixed_model.predict(books_test_X)
m_dvd_y_pred = mixed_model.predict(dvd_test_X)
m_elec_y_pred = mixed_model.predict(electronics_test_X)
m_kitchen_y_pred = mixed_model.predict(kitchen_test_X)
m_mixed_y_pred = mixed_model.predict(mixed_test_X)