import pandas as pd
from sklearn.model_selection import train_test_split
import seaborn as sns
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
# PARSE DATA
# Per-domain buckets of parsed review dicts (keys: 'rating', 'text', 'domain').
books_reviews = []
dvd_reviews = []
electronics_reviews = []
kitchen_reviews = []

# Use a context manager so the file handle is closed even if reading fails.
with open("data/reviews.txt", "r") as f:
    data = f.read().splitlines()

# Tag line -> review field that receives the line following the tag.
_TAG_TO_FIELD = {
    '<rating>': 'rating',
    '<review_text>': 'text',
    '<product_type>': 'domain',
}

# Value of <product_type> -> bucket list; reviews of any other domain are dropped.
_REVIEWS_BY_DOMAIN = {
    'books': books_reviews,
    'dvd': dvd_reviews,
    'electronics': electronics_reviews,
    'kitchen & housewares': kitchen_reviews,
}


def _new_review():
    """Return a fresh, empty review record."""
    return {'rating': '', 'text': '', 'domain': ''}


def _store_review(parsed):
    """Append ``parsed`` to the bucket of its domain, if that domain is known."""
    bucket = _REVIEWS_BY_DOMAIN.get(parsed['domain'])
    if bucket is not None:
        bucket.append(parsed)


review = _new_review()
for x, line in enumerate(data):
    field = _TAG_TO_FIELD.get(line)
    if field is not None:
        # The value is the line after the tag; a tag on the very last line
        # has no value and is skipped instead of raising IndexError.
        if x + 1 < len(data):
            review[field] = data[x + 1]
    elif line == '<unique_id>' and review != _new_review():
        # A <unique_id> tag starts the next review, so flush the one
        # collected so far and begin a new record.
        _store_review(review)
        review = _new_review()

# The last review in the file is not followed by another <unique_id>, so it
# has to be flushed explicitly or it would be silently lost.
if review != _new_review():
    _store_review(review)
    review = _new_review()
import pandas as pd
def transform_to_dataframe(reviews):
    """Build a DataFrame from a list of parsed review dicts.

    Args:
        reviews: list of dicts with the string keys 'rating', 'text' and
            'domain'. May be empty.

    Returns:
        A DataFrame with the columns 'rating', 'text', 'domain' and
        'polarity'. 'polarity' is "positive" for the ratings "4.0" and "5.0"
        and "negative" for everything else. An empty input yields an empty
        frame with the same four columns.
    """
    positive_ratings = ("4.0", "5.0")
    return pd.DataFrame(data={
        'rating': [r["rating"] for r in reviews],
        'text': [r["text"] for r in reviews],
        # Read the domain per row instead of broadcasting reviews[0]["domain"]:
        # this works for an empty list and never mislabels a row.
        'domain': [r["domain"] for r in reviews],
        'polarity': [
            "positive" if r["rating"] in positive_ratings else "negative"
            for r in reviews
        ],
    })
# Convert each per-domain list of review dicts into a DataFrame, rebinding the same names.
books_reviews = transform_to_dataframe(books_reviews)
dvd_reviews = transform_to_dataframe(dvd_reviews)
electronics_reviews = transform_to_dataframe(electronics_reviews)
kitchen_reviews = transform_to_dataframe(kitchen_reviews)
# Stack the four domains into one frame with a fresh 0..n-1 index.
mixed = pd.concat([books_reviews, dvd_reviews, electronics_reviews, kitchen_reviews]).reset_index(drop=True)
# Fix the category order for plotting; rating strings outside these four values become NaN.
mixed['rating'] = pd.Categorical(mixed['rating'], categories=['1.0', '2.0', '4.0', '5.0'])
# Rating distribution with one dodged bar per domain.
# NOTE(review): displot is figure-level, so `ax` presumably holds a FacetGrid rather than an Axes.
ax = sns.displot(mixed, x="rating", multiple='dodge', hue='domain')
# Shuffle the rows of every domain frame with a fixed seed and renumber the index.
books_reviews = books_reviews.sample(frac=1, random_state=42).reset_index(drop=True)
dvd_reviews = dvd_reviews.sample(frac=1, random_state=42).reset_index(drop=True)
electronics_reviews = electronics_reviews.sample(frac=1, random_state=42).reset_index(drop=True)
kitchen_reviews = kitchen_reviews.sample(frac=1, random_state=42).reset_index(drop=True)
# Preview of the first rows (only rendered as the last expression of a notebook cell).
dvd_reviews.head()
from sklearn.model_selection import train_test_split
# 80/20 train/test split per domain, seeded for reproducibility.
books_train, books_test = train_test_split(books_reviews, test_size=0.2, random_state=42)
dvd_train, dvd_test = train_test_split(dvd_reviews, test_size=0.2, random_state=42)
electronics_train, electronics_test = train_test_split(electronics_reviews, test_size=0.2, random_state=42)
kitchen_train, kitchen_test = train_test_split(kitchen_reviews, test_size=0.2, random_state=42)
# Mixed-domain training set: the four per-domain training parts, shuffled together.
mixed_train = pd.concat([books_train, dvd_train, electronics_train, kitchen_train])
mixed_train = mixed_train.sample(frac=1, random_state=42).reset_index(drop=True)
mixed_train.head()
# Mixed-domain test set, built the same way from the per-domain test parts.
mixed_test = pd.concat([books_test, dvd_test, electronics_test, kitchen_test])
mixed_test = mixed_test.sample(frac=1, random_state=42).reset_index(drop=True)
mixed_test.head()
import seaborn as sns
def distribution_table(dataset):
    """Draw a seaborn histogram of the ``rating`` column of ``dataset``.

    Args:
        dataset: data passed to ``sns.histplot``; must provide a "rating"
            variable.

    Returns:
        None. The plot is produced as a side effect.
    """
    sns.histplot(data=dataset, x="rating")
# Plot the rating histograms of the mixed training and test sets.
distribution_table(mixed_train)
distribution_table(mixed_test)
import numpy as np
# Count how often each rating value occurs in the mixed test set.
unique, amount = np.unique(mixed_test["rating"].to_numpy(), return_counts=True)
counts = dict(zip(unique, amount))
print(counts)
# Use the builtin sum() instead of a global accumulator named `sum`, which
# would shadow the builtin for every statement that runs afterwards.
total = sum(counts.values())
# Share of the test set rated '5.0'; reused by the baseline metrics below.
tp = counts['5.0'] / total
print(tp)
{'1.0': 510, '2.0': 311, '4.0': 233, '5.0': 546}
0.34125
# Hand-computed baseline metrics from the '5.0' share `tp` computed above.
# The denominator tp + (1 - tp) is 1, so precision equals tp.
precision = tp / (tp + (1 - tp)) # FP is (1 - TP) in this case
# The false-negative term is taken as 0, so recall is 1.
recall = tp / (tp + 0)
# Harmonic mean of precision and recall.
f_score = (2 * precision * recall) / (precision + recall)
print(f"precision: {precision}, recall: {recall}, f_score: {f_score}")
precision: 0.34125, recall: 1.0, f_score: 0.5088536812674743
from sklearn.dummy import DummyClassifier
# Random baseline for rating prediction on the books domain.
X = books_train['text'] # list(map(lambda x : x["text"], books_train))
y = books_train['rating'] # list(map(lambda x : x["rating"], books_train))
# NOTE(review): no random_state is passed, so the predictions presumably vary between runs.
dummy_clf = DummyClassifier(strategy="stratified")
dummy_clf.fit(X, y)
y_pred = dummy_clf.predict(books_test['text'])
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
def print_scores(test_Y, test_Y_pred):
    """Print the confusion matrix and summary metrics for a set of predictions.

    Args:
        test_Y: true labels.
        test_Y_pred: predicted labels, aligned with ``test_Y``.

    Prints, in order: confusion matrix, accuracy, and macro-averaged
    precision, recall and F-score. Returns None.
    """
    matrix = confusion_matrix(test_Y, test_Y_pred)
    print(f"Confusion matrix:\n {matrix}")

    accuracy = accuracy_score(test_Y, test_Y_pred)
    print(f"Accuracy: {accuracy}")

    # The remaining metrics share the same call shape and macro averaging.
    macro_metrics = (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F-score", f1_score),
    )
    for label, metric in macro_metrics:
        value = metric(test_Y, test_Y_pred, average='macro')
        print(f"{label}: {value}")
# Score the dummy baseline's rating predictions on the books test set.
print_scores(books_test['rating'], y_pred)
Confusion matrix:
[[26 28 16 43]
[20 17 10 37]
[14 16 5 28]
[42 32 18 48]]
Accuracy: 0.24
Precision: 0.2118576959319708
Recall: 0.21367291754459897
F-score: 0.21189022480163588
# Features (review text) and labels (polarity) of the mixed-domain split,
# shared by the vectorizer experiments that follow.
train_X = mixed_train['text']
train_Y = mixed_train['polarity']
test_X = mixed_test['text']
test_Y = mixed_test['polarity']
from sklearn.pipeline import Pipeline
def run_classifier_pipe(vec, clf, train_X, train_Y, test_X):
    """Fit a vectorizer + classifier pipeline and predict on the test inputs.

    Args:
        vec: vectorizer used as the 'vectorizer' pipeline step.
        clf: estimator used as the 'classifier' pipeline step.
        train_X: training inputs.
        train_Y: training labels.
        test_X: inputs to predict on.

    Returns:
        The pipeline's predictions for ``test_X``.

    Both ``vec`` and ``clf`` are fitted in place as part of the pipeline.
    """
    steps = [('vectorizer', vec), ('classifier', clf)]
    model = Pipeline(steps)
    # Pipeline.fit returns the pipeline itself, so the calls can be chained.
    return model.fit(train_X, train_Y).predict(test_X)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron
# Experiment: TF-IDF features (English stop words removed) with a Perceptron.
vec = TfidfVectorizer(stop_words="english")
clf = Perceptron()
test_Y_pred_Tfidf = run_classifier_pipe(vec, clf, train_X, train_Y, test_X)
print_scores(test_Y, test_Y_pred_Tfidf)
Confusion matrix:
[[586 235]
[197 582]]
Accuracy: 0.73
Precision: 0.730382938545687
Recall: 0.7304376922222969
F-score: 0.7299983124894531
from sklearn.feature_extraction.text import CountVectorizer
# Experiment: raw token counts (default CountVectorizer settings) with a Perceptron.
vec = CountVectorizer()
clf = Perceptron()
test_Y_pred_counts = run_classifier_pipe(vec, clf, train_X, train_Y, test_X)
print_scores(test_Y, test_Y_pred_counts)
Confusion matrix:
[[659 162]
[239 540]]
Accuracy: 0.749375
Precision: 0.7515418879561419
Recall: 0.7479380323003819
F-score: 0.7479809210244639
# Experiment: counts of 4-grams only (ngram_range=(4, 4)) with a Perceptron.
vec = CountVectorizer(ngram_range=(4, 4))
clf = Perceptron()
test_Y_pred_counts_ng44 = run_classifier_pipe(vec, clf, train_X, train_Y, test_X)
print_scores(test_Y, test_Y_pred_counts_ng44)
Confusion matrix:
[[408 413]
[144 635]]
Accuracy: 0.651875
Precision: 0.6725232326584799
Recall: 0.6560512790844941
F-score: 0.6447238178157133
# Experiment: word 2- to 4-gram counts without lowercasing, with a Perceptron.
vec = CountVectorizer(analyzer='word', ngram_range=(2, 4), lowercase=False)
clf = Perceptron()
test_Y_pred_counts_ng24 = run_classifier_pipe(vec, clf, train_X, train_Y, test_X)
print_scores(test_Y, test_Y_pred_counts_ng24)
Confusion matrix:
[[624 197]
[232 547]]
Accuracy: 0.731875
Precision: 0.7320935081901316
Recall: 0.7311155030263041
F-score: 0.7312525767686175
# Experiment: character n-gram counts (n = 2..10) with a Perceptron.
vec = CountVectorizer(analyzer='char', ngram_range=(2, 10))
clf = Perceptron()
test_Y_pred_counts_char = run_classifier_pipe(vec, clf, train_X, train_Y, test_X)
print_scores(test_Y, test_Y_pred_counts_char)
Confusion matrix:
[[646 175]
[167 612]]
Accuracy: 0.78625
Precision: 0.7861122702713685
Recall: 0.7862339518324346
F-score: 0.7861534349104519
# Experiment: 'char_wb' character n-gram counts (n = 2..10) with a Perceptron.
vec = CountVectorizer(analyzer='char_wb', ngram_range=(2, 10))
clf = Perceptron()
test_Y_pred_counts_wbchar = run_classifier_pipe(vec, clf, train_X, train_Y, test_X)
print_scores(test_Y, test_Y_pred_counts_wbchar)
Confusion matrix:
[[615 206]
[176 603]]
Accuracy: 0.76125
Precision: 0.7614307435784842
Recall: 0.7615778997715613
F-score: 0.7612365695570376
from sklearn.feature_extraction.text import HashingVectorizer
# Experiment: hashed features (default HashingVectorizer settings) with a Perceptron.
vec = HashingVectorizer()
clf = Perceptron()
test_Y_pred_hashing = run_classifier_pipe(vec, clf, train_X, train_Y, test_X)
print_scores(test_Y, test_Y_pred_hashing)
Confusion matrix:
[[670 151]
[257 522]]
Accuracy: 0.745
Precision: 0.7491965486454732
Recall: 0.7430839062541532
F-score: 0.7427993267394141
from sklearn.feature_extraction.text import HashingVectorizer
# Experiment: hashing into only 2**4 = 16 features, with a Perceptron.
vec = HashingVectorizer(n_features=2**4)
clf = Perceptron()
test_Y_pred_hashing_2p4 = run_classifier_pipe(vec, clf, train_X, train_Y, test_X)
print_scores(test_Y, test_Y_pred_hashing_2p4)
Confusion matrix:
[[476 345]
[408 371]]
Accuracy: 0.529375
Precision: 0.528308981521272
Recall: 0.5280161798989617
F-score: 0.5273394207475555
from sklearn.feature_extraction.text import HashingVectorizer
# Experiment: hashing into 2**31 - 1 features, with a Perceptron.
vec = HashingVectorizer(n_features=(2**31 - 1))
clf = Perceptron()
test_Y_pred_hashing_2p31 = run_classifier_pipe(vec, clf, train_X, train_Y, test_X)
print_scores(test_Y, test_Y_pred_hashing_2p31)
Confusion matrix:
[[622 199]
[200 579]]
Accuracy: 0.750625
Precision: 0.7504534679351259
Recall: 0.7504366289896631
F-score: 0.7504447548248715
def _summary_scores(y_true, y_pred):
    """Return [accuracy, precision, recall, F-score] rounded to 4 decimals.

    Precision, recall and F-score are macro-averaged.
    """
    return [
        round(accuracy_score(y_true, y_pred), 4),
        round(precision_score(y_true, y_pred, average='macro'), 4),
        round(recall_score(y_true, y_pred, average='macro'), 4),
        round(f1_score(y_true, y_pred, average='macro'), 4),
    ]


# Column label -> Perceptron predictions on the mixed test set, one entry per
# vectorizer experiment. Insertion order defines the column order.
_predictions_by_vectorizer = {
    'TF-IDF': test_Y_pred_Tfidf,
    'Counts': test_Y_pred_counts,
    'Counts ngrams=4,4': test_Y_pred_counts_ng44,
    'Counts ngrams=2,4': test_Y_pred_counts_ng24,
    'Counts char': test_Y_pred_counts_char,
    'Counts wb_char': test_Y_pred_counts_wbchar,
    'Hashing': test_Y_pred_hashing,
    'Hashing 2^4': test_Y_pred_hashing_2p4,
    'Hashing 2^31-1': test_Y_pred_hashing_2p31,
}

# One column per vectorizer, one row per metric.
overviewtable = pd.DataFrame(
    data={
        label: _summary_scores(test_Y, predictions)
        for label, predictions in _predictions_by_vectorizer.items()
    },
    index=['Accuracy', 'Precision', 'Recall', 'F-score'],
)
# Show vectorizers as rows and metrics as columns.
overviewtable.transpose().head(10)
# Per-domain features (review text) and labels (polarity) for the in-domain experiments.
# Books
train_books_X = books_train['text']
train_books_Y = books_train['polarity']
test_books_X = books_test['text']
test_books_Y = books_test['polarity']
# DVD
train_dvd_X = dvd_train['text']
train_dvd_Y = dvd_train['polarity']
test_dvd_X = dvd_test['text']
test_dvd_Y = dvd_test['polarity']
# Electronics
train_electronics_X = electronics_train['text']
train_electronics_Y = electronics_train['polarity']
test_electronics_X = electronics_test['text']
test_electronics_Y = electronics_test['polarity']
# Kitchen
train_kitchen_X = kitchen_train['text']
train_kitchen_Y = kitchen_train['polarity']
test_kitchen_X = kitchen_test['text']
test_kitchen_Y = kitchen_test['polarity']
from sklearn.linear_model import Perceptron
from sklearn.feature_extraction.text import CountVectorizer
# One Perceptron per domain, so each in-domain experiment trains its own model.
books_clf = Perceptron()
dvd_clf = Perceptron()
electronics_clf = Perceptron()
kitchen_clf = Perceptron()
# A single CountVectorizer instance is reused; run_classifier_pipe refits it on every call.
vec = CountVectorizer()
# Books: train and evaluate in-domain.
books_Y_pred = run_classifier_pipe(vec, books_clf, train_books_X, train_books_Y, test_books_X)
print_scores(test_books_Y, books_Y_pred)
Confusion matrix:
[[137 60]
[ 44 159]]
Accuracy: 0.74
Precision: 0.7414667373041701
Recall: 0.7393413518041559
F-score: 0.7392111136186965
# DVD: train and evaluate in-domain.
dvd_Y_pred = run_classifier_pipe(vec, dvd_clf, train_dvd_X, train_dvd_Y, test_dvd_X)
print_scores(test_dvd_Y, dvd_Y_pred)
Confusion matrix:
[[136 61]
[ 39 164]]
Accuracy: 0.75
Precision: 0.753015873015873
Recall: 0.7491185516741267
F-score: 0.7487689679429204
# Electronics: train and evaluate in-domain.
electronics_Y_pred = run_classifier_pipe(vec, electronics_clf, train_electronics_X, train_electronics_Y, test_electronics_X)
print_scores(test_electronics_Y, electronics_Y_pred)
Confusion matrix:
[[162 51]
[ 55 132]]
Accuracy: 0.735
Precision: 0.7339276271058397
Recall: 0.7332228666114333
F-score: 0.7335009428032684
# Kitchen: train and evaluate in-domain.
kitchen_Y_pred = run_classifier_pipe(vec, kitchen_clf, train_kitchen_X, train_kitchen_Y, test_kitchen_X)
print_scores(test_kitchen_Y, kitchen_Y_pred)
Confusion matrix:
[[171 43]
[ 46 140]]
Accuracy: 0.7775
Precision: 0.7765228777920475
Recall: 0.7758767963018792
F-score: 0.7761555341771516
# Books again, with the Perceptron iteration cap raised to max_iter=2000.
books_clf_iter_max_changes = Perceptron(max_iter=2000)
books_Y2_pred = run_classifier_pipe(vec, books_clf_iter_max_changes, train_books_X, train_books_Y, test_books_X)
print_scores(test_books_Y, books_Y2_pred)
Confusion matrix:
[[137 60]
[ 44 159]]
Accuracy: 0.74
Precision: 0.7414667373041701
Recall: 0.7393413518041559
F-score: 0.7392111136186965
# Books again, with the Perceptron iteration cap lowered to max_iter=5.
books_clf_iter_min_changes = Perceptron(max_iter=5)
books_Y3_pred = run_classifier_pipe(vec, books_clf_iter_min_changes, train_books_X, train_books_Y, test_books_X)
print_scores(test_books_Y, books_Y3_pred)
Confusion matrix:
[[101 96]
[ 36 167]]
Accuracy: 0.67
Precision: 0.6861036329827093
Recall: 0.6676752269260584
F-score: 0.6607643082932846
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_stochastic_gradient.py:700: ConvergenceWarning: Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.
ConvergenceWarning,
from sklearn.neighbors import KNeighborsClassifier
# Domain classification: predict the product domain from the review text.
domain_train_X = mixed_train['text']
domain_train_Y = mixed_train['domain']
domain_test_X = mixed_test['text']
domain_test_Y = mixed_test['domain']
# Character n-grams (n = 2..5) inside word boundaries, ignoring n-grams that
# occur in more than 95% of the documents. stop_words is intentionally not
# passed: scikit-learn only applies it when analyzer == 'word', so with
# 'char_wb' it had no effect and only triggered a warning.
vec = CountVectorizer(analyzer='char_wb', ngram_range=(2, 5), max_df=0.95)
# k-nearest neighbours with the default number of neighbours.
clf = KNeighborsClassifier()
domain_Y_pred = run_classifier_pipe(vec, clf, domain_train_X, domain_train_Y, domain_test_X)
print_scores(domain_test_Y, domain_Y_pred)
Confusion matrix:
[[295 63 22 20]
[ 72 254 45 29]
[ 38 45 235 82]
[ 47 54 88 211]]
Accuracy: 0.621875
Precision: 0.620688739306287
Recall: 0.621875
F-score: 0.6196767862545537
# Domain classification with k = 2 neighbours (same vectorizer instance, refitted).
clf_2 = KNeighborsClassifier(n_neighbors=2)
domain_Y2_pred = run_classifier_pipe(vec, clf_2, domain_train_X, domain_train_Y, domain_test_X)
print_scores(domain_test_Y, domain_Y2_pred)
Confusion matrix:
[[322 48 19 11]
[128 228 33 11]
[ 83 75 207 35]
[ 92 64 117 127]]
Accuracy: 0.5525
Precision: 0.5763367241398526
Recall: 0.5525
F-score: 0.5390596367173268
# Domain classification with k = 7 neighbours.
clf_7 = KNeighborsClassifier(n_neighbors=7)
domain_Y7_pred = run_classifier_pipe(vec, clf_7, domain_train_X, domain_train_Y, domain_test_X)
print_scores(domain_test_Y, domain_Y7_pred)
Confusion matrix:
[[299 53 26 22]
[ 69 260 36 35]
[ 38 57 223 82]
[ 33 51 90 226]]
Accuracy: 0.63
Precision: 0.628128835021385
Recall: 0.63
F-score: 0.6281151890271177
# Domain classification with k = 10 neighbours and distance-weighted votes.
clf_10 = KNeighborsClassifier(n_neighbors=10, weights='distance')
domain_Y10_pred = run_classifier_pipe(vec, clf_10, domain_train_X, domain_train_Y, domain_test_X)
print_scores(domain_test_Y, domain_Y10_pred)
Confusion matrix:
[[282 62 30 26]
[ 37 267 47 49]
[ 28 49 221 102]
[ 17 51 73 259]]
Accuracy: 0.643125
Precision: 0.6467067314717478
Recall: 0.643125
F-score: 0.6438170376579524
# Domain classification with k = 15 neighbours.
clf_15 = KNeighborsClassifier(n_neighbors=15)
domain_Y15_pred = run_classifier_pipe(vec, clf_15, domain_train_X, domain_train_Y, domain_test_X)
print_scores(domain_test_Y, domain_Y15_pred)
Confusion matrix:
[[289 62 30 19]
[ 54 259 44 43]
[ 27 67 224 82]
[ 26 54 87 233]]
Accuracy: 0.628125
Precision: 0.6289065368933523
Recall: 0.628125
F-score: 0.6279439475472841
# Domain classification with k = 20 neighbours.
clf_20 = KNeighborsClassifier(n_neighbors=20)
domain_Y20_pred = run_classifier_pipe(vec, clf_20, domain_train_X, domain_train_Y, domain_test_X)
print_scores(domain_test_Y, domain_Y20_pred)
Confusion matrix:
[[286 69 22 23]
[ 47 268 46 39]
[ 21 70 220 89]
[ 20 61 91 228]]
Accuracy: 0.62625
Precision: 0.6298533756239794
Recall: 0.62625
F-score: 0.6266805407874372
from sklearn.naive_bayes import MultinomialNB
# Multinomial naive Bayes on token counts, predicting the rating string from the review text.
bayes_train_X = mixed_train['text']
bayes_train_Y = mixed_train['rating']
bayes_test_X = mixed_test['text']
bayes_test_Y = mixed_test['rating']
vec = CountVectorizer()
clf_bayes = MultinomialNB()
bayes_pred_Y = run_classifier_pipe(vec, clf_bayes, bayes_train_X, bayes_train_Y, bayes_test_X)
print_scores(bayes_test_Y, bayes_pred_Y)
# print_scores reports macro averages only; also report micro-averaged precision.
print(f"Micro-average precision: {precision_score(bayes_test_Y, bayes_pred_Y, average='micro')}")
Confusion matrix:
[[349 55 2 104]
[133 76 2 100]
[ 46 29 4 154]
[ 70 37 7 432]]
Accuracy: 0.538125
Precision: 0.44572523796721936
Recall: 0.4342657222567335
F-score: 0.40203528717116016
Micro-average precision: 0.538125
# Same naive Bayes setup, now predicting polarity instead of the rating.
# bayes_train_X / bayes_test_X and vec are reused from the rating experiment.
clf = MultinomialNB()
bayes_train_Y = mixed_train['polarity']
bayes_test_Y = mixed_test['polarity']
bayes_pred_Y = run_classifier_pipe(vec, clf, bayes_train_X, bayes_train_Y, bayes_test_X)
print_scores(bayes_test_Y, bayes_pred_Y)
Confusion matrix:
[[634 187]
[203 576]]
Accuracy: 0.75625
Precision: 0.7561909772623001
Recall: 0.7558192441979552
F-score: 0.7559292758139993
from sklearn.tree import DecisionTreeClassifier
# Decision tree on token counts, predicting polarity on the mixed-domain split.
train_X = mixed_train['text']
train_Y = mixed_train['polarity']
test_X = mixed_test['text']
test_Y = mixed_test['polarity']
vec = CountVectorizer()
# NOTE(review): no random_state is passed to the trees in this section, so scores presumably vary between runs.
clf = DecisionTreeClassifier()
Y_pred = run_classifier_pipe(vec, clf, train_X, train_Y, test_X)
print_scores(test_Y, Y_pred)
Confusion matrix:
[[533 288]
[273 506]]
Accuracy: 0.649375
Precision: 0.6492849597789876
Recall: 0.6493794943077965
F-score: 0.6492751256119419
# Decision tree variant: splitter="random".
clf_r = DecisionTreeClassifier(splitter="random")
Y_pred_r = run_classifier_pipe(vec, clf_r, train_X, train_Y, test_X)
print_scores(test_Y, Y_pred_r)
Confusion matrix:
[[548 273]
[252 527]]
Accuracy: 0.671875
Precision: 0.671875
Recall: 0.6719935142809341
F-score: 0.6718184656028636
# Decision tree variant: criterion="entropy".
clf_c = DecisionTreeClassifier(criterion="entropy")
Y_pred_c = run_classifier_pipe(vec, clf_c, train_X, train_Y, test_X)
print_scores(test_Y, Y_pred_c)
Confusion matrix:
[[526 295]
[288 491]]
Accuracy: 0.635625
Precision: 0.6354367900169426
Recall: 0.6354886726635072
F-score: 0.6354505573956288
# Decision tree variant: min_samples_split=10 and min_samples_leaf=10.
clf_ms = DecisionTreeClassifier(min_samples_split=10, min_samples_leaf=10)
Y_pred_ms = run_classifier_pipe(vec, clf_ms, train_X, train_Y, test_X)
print_scores(test_Y, Y_pred_ms)
Confusion matrix:
[[527 294]
[271 508]]
Accuracy: 0.646875
Precision: 0.6469087306795667
Recall: 0.6470091109655247
F-score: 0.6468251968343973
# Decision tree variant: entropy criterion with max_depth=2.
clf_md = DecisionTreeClassifier(criterion="entropy", max_depth=2)
Y_pred_md = run_classifier_pipe(vec, clf_md, train_X, train_Y, test_X)
print_scores(test_Y, Y_pred_md)
Confusion matrix:
[[767 54]
[632 147]]
Accuracy: 0.57125
Precision: 0.6397960163442971
Recall: 0.561465009483097
F-score: 0.49549549549549554
# Decision tree variant: class weights of 0.01 for "positive" and 0.99 for "negative".
clf_cw = DecisionTreeClassifier(class_weight={"positive": 0.01, "negative": 0.99})
Y_pred_cw = run_classifier_pipe(vec, clf_cw, train_X, train_Y, test_X)
print_scores(test_Y, Y_pred_cw)
Confusion matrix:
[[573 248]
[417 362]]
Accuracy: 0.584375
Precision: 0.5861152508693492
Recall: 0.581313842819818
F-score: 0.5770189295955164
from sklearn.svm import SVC
# Support vector classifiers on token counts, predicting polarity on the mixed-domain split.
train_X = mixed_train["text"]
train_Y = mixed_train["polarity"]
test_X = mixed_test["text"]
test_Y = mixed_test["polarity"]
vec = CountVectorizer()
# Linear-kernel SVCs that differ only in the regularization parameter C.
clf_linear_c_1 = SVC(kernel="linear") # 1 is the default parameter for C
clf_linear_c_2 = SVC(kernel="linear", C=2.0)
clf_linear_c_4 = SVC(kernel="linear", C=4.0)
clf_linear_c_8 = SVC(kernel="linear", C=8.0)
clf_linear_c_128 = SVC(kernel="linear", C=128.0)
# Linear kernel with the default C.
y_pred_lin_1 = run_classifier_pipe(vec, clf_linear_c_1, train_X, train_Y, test_X)
print_scores(test_Y, y_pred_lin_1)
Confusion matrix:
[[617 204]
[212 567]]
Accuracy: 0.74
Precision: 0.7398393826888146
Recall: 0.7396893797132087
F-score: 0.7397458455522972
# Linear kernel, C=2.0.
y_pred_lin_2 = run_classifier_pipe(vec, clf_linear_c_2, train_X, train_Y, test_X)
print_scores(test_Y, y_pred_lin_2)
Confusion matrix:
[[609 212]
[220 559]]
Accuracy: 0.73
Precision: 0.7298262247734915
Recall: 0.7296824843368634
F-score: 0.7297360703812317
# Linear kernel, C=4.0.
y_pred_lin_4 = run_classifier_pipe(vec, clf_linear_c_4, train_X, train_Y, test_X)
print_scores(test_Y, y_pred_lin_4)
Confusion matrix:
[[581 240]
[215 564]]
Accuracy: 0.715625
Precision: 0.715696017400435
Recall: 0.7158393518033519
F-score: 0.715592893103948
# Linear kernel, C=8.0.
y_pred_lin_8 = run_classifier_pipe(vec, clf_linear_c_8, train_X, train_Y, test_X)
print_scores(test_Y, y_pred_lin_8)
Confusion matrix:
[[575 246]
[215 564]]
Accuracy: 0.711875
Precision: 0.7120721987810595
Recall: 0.7121852714135835
F-score: 0.7118613809480838
# Linear kernel, C=128.0.
y_pred_lin_128 = run_classifier_pipe(vec, clf_linear_c_128, train_X, train_Y, test_X)
print_scores(test_Y, y_pred_lin_128)
Confusion matrix:
[[546 275]
[198 581]]
Accuracy: 0.704375
Precision: 0.7063046427494724
Recall: 0.7054353077667581
F-score: 0.7042334710945667
# SVCs with the default kernel, varying C and gamma.
# Naming: clf_rbf_<C>_<gamma>; "scale" means gamma is left at its default.
clf_rbf_1_scale = SVC() # default kernel is rbf already
clf_rbf_2_scale = SVC(C=2.0)
clf_rbf_128_scale = SVC(C=128.0)
clf_rbf_1_auto = SVC(gamma="auto")
clf_rbf_128_auto = SVC(C=128.0, gamma="auto")
clf_rbf_1_4 = SVC(gamma=4.0)
clf_rbf_128_128 = SVC(C=128.0, gamma=128.0)
clf_rbf_1_05 = SVC(C=1, gamma=0.5)
# Default C and default gamma.
y_pred_rbf_1_scale = run_classifier_pipe(vec, clf_rbf_1_scale, train_X, train_Y, test_X)
print_scores(test_Y, y_pred_rbf_1_scale)
Confusion matrix:
[[633 188]
[211 568]]
Accuracy: 0.750625
Precision: 0.7506613756613756
Recall: 0.7500754426096732
F-score: 0.7502127534700824
# C=2.0, default gamma.
y_pred_rbf_2_scale = run_classifier_pipe(vec, clf_rbf_2_scale, train_X, train_Y, test_X)
print_scores(test_Y, y_pred_rbf_2_scale)
Confusion matrix:
[[645 176]
[220 559]]
Accuracy: 0.7525
Precision: 0.7531044787857339
Recall: 0.7516069666754748
F-score: 0.7517828852418942
# C=128.0, default gamma.
y_pred_rbf_128_scale = run_classifier_pipe(vec, clf_rbf_128_scale, train_X, train_Y, test_X)
print_scores(test_Y, y_pred_rbf_128_scale)
Confusion matrix:
[[593 228]
[211 568]]
Accuracy: 0.725625
Precision: 0.7255650141253531
Recall: 0.7257149066778827
F-score: 0.7255579975579977
# Default C, gamma="auto".
y_pred_rbf_1_auto = run_classifier_pipe(vec, clf_rbf_1_auto, train_X, train_Y, test_X)
print_scores(test_Y, y_pred_rbf_1_auto)
Confusion matrix:
[[ 67 754]
[ 28 751]]
Accuracy: 0.51125
Precision: 0.6021332400769366
Recall: 0.522832139020794
F-score: 0.4019532116336161
# C=128.0, gamma="auto".
y_pred_rbf_128_auto = run_classifier_pipe(vec, clf_rbf_128_auto, train_X, train_Y, test_X)
print_scores(test_Y, y_pred_rbf_128_auto)
Confusion matrix:
[[623 198]
[187 592]]
Accuracy: 0.759375
Precision: 0.7592514455383654
Recall: 0.7593896731966872
F-score: 0.7592846377097028
# Default C, gamma=4.0.
y_pred_rbf_1_4 = run_classifier_pipe(vec, clf_rbf_1_4, train_X, train_Y, test_X)
print_scores(test_Y, y_pred_rbf_1_4)
Confusion matrix:
[[ 35 786]
[ 0 779]]
Accuracy: 0.50875
Precision: 0.7488817891373802
Recall: 0.5213154689403167
F-score: 0.37322573442633405
# C=128.0, gamma=128.0.
y_pred_rbf_128_128 = run_classifier_pipe(vec, clf_rbf_128_128, train_X, train_Y, test_X)
print_scores(test_Y, y_pred_rbf_128_128)
Confusion matrix:
[[ 35 786]
[ 0 779]]
Accuracy: 0.50875
Precision: 0.7488817891373802
Recall: 0.5213154689403167
F-score: 0.37322573442633405
# C=1, gamma=0.5.
y_pred_rbf_1_05 = run_classifier_pipe(vec, clf_rbf_1_05, train_X, train_Y, test_X)
print_scores(test_Y, y_pred_rbf_1_05)
Confusion matrix:
[[ 38 783]
[ 1 778]]
Accuracy: 0.51
Precision: 0.7363787184414987
Recall: 0.5225006606114526
F-score: 0.37666467899026035
# Cross-domain experiment: per-domain training data (text -> polarity).
books_train_X = books_train['text']
books_train_Y = books_train['polarity']
dvd_train_X = dvd_train["text"]
dvd_train_Y = dvd_train["polarity"]
electronics_train_X = electronics_train["text"]
electronics_train_Y = electronics_train["polarity"]
kitchen_train_X = kitchen_train["text"]
# Labels belong in kitchen_train_Y; assigning them to kitchen_train_X would
# overwrite the text features with the polarity column.
kitchen_train_Y = kitchen_train["polarity"]

# Per-domain test data.
books_test_X = books_test['text']
books_test_Y = books_test['polarity']
dvd_test_X = dvd_test["text"]
dvd_test_Y = dvd_test["polarity"]
electronics_test_X = electronics_test["text"]
electronics_test_Y = electronics_test["polarity"]
kitchen_test_X = kitchen_test["text"]
# Same correction as for the training data: labels go into kitchen_test_Y.
kitchen_test_Y = kitchen_test["polarity"]

# Aliases with the train_<domain>_X / test_<domain>_X naming, kept so code
# that uses either naming scheme keeps working.
train_books_X = books_train['text']
train_books_Y = books_train['polarity']
test_books_X = books_test['text']
test_books_Y = books_test['polarity']
train_dvd_X = dvd_train['text']
train_dvd_Y = dvd_train['polarity']
test_dvd_X = dvd_test['text']
test_dvd_Y = dvd_test['polarity']
train_electronics_X = electronics_train['text']
train_electronics_Y = electronics_train['polarity']
test_electronics_X = electronics_test['text']
test_electronics_Y = electronics_test['polarity']
train_kitchen_X = kitchen_train['text']
train_kitchen_Y = kitchen_train['polarity']
test_kitchen_X = kitchen_test['text']
test_kitchen_Y = kitchen_test['polarity']

from sklearn.pipeline import Pipeline
from sklearn.linear_model import Perceptron

# One classifier and one vectorizer per domain: the fitted pipelines are kept
# and reused for prediction, so they must not share fitted components.
books_clf = Perceptron()
dvd_clf = Perceptron()
electronics_clf = Perceptron()
kitchen_clf = Perceptron()
vec = CountVectorizer()
vec2 = CountVectorizer()
vec3 = CountVectorizer()
vec4 = CountVectorizer()


def create_train_pipeline(vec, clf, train_X, train_Y):
    """Build a vectorizer + classifier pipeline and fit it.

    Args:
        vec: vectorizer used as the 'vectorizer' step (fitted in place).
        clf: estimator used as the 'classifier' step (fitted in place).
        train_X: training inputs.
        train_Y: training labels.

    Returns:
        The fitted Pipeline, ready for ``predict``.
    """
    pipeline = Pipeline([
        ('vectorizer', vec),
        ('classifier', clf)
    ])
    pipeline.fit(train_X, train_Y)
    return pipeline


books_model = create_train_pipeline(vec, books_clf, books_train_X, books_train_Y)
dvd_model = create_train_pipeline(vec2, dvd_clf, dvd_train_X, dvd_train_Y)
electronics_model = create_train_pipeline(vec3, electronics_clf, electronics_train_X, electronics_train_Y)
# Train on (text, polarity); passing kitchen_train_X twice would fit the
# model with the same series as both features and labels.
kitchen_model = create_train_pipeline(vec4, kitchen_clf, kitchen_train_X, kitchen_train_Y)
# Cross-domain predictions. Prefix = training domain (b/d/e/k), suffix = test domain.
# Model trained on books.
b_books_y_pred = books_model.predict(books_test_X)
b_dvd_y_pred = books_model.predict(dvd_test_X)
b_elec_y_pred = books_model.predict(electronics_test_X)
b_kitchen_y_pred = books_model.predict(kitchen_test_X)
# Model trained on DVD.
d_books_y_pred = dvd_model.predict(books_test_X)
d_dvd_y_pred = dvd_model.predict(dvd_test_X)
d_elec_y_pred = dvd_model.predict(electronics_test_X)
d_kitchen_y_pred = dvd_model.predict(kitchen_test_X)
# Model trained on electronics.
e_books_y_pred = electronics_model.predict(books_test_X)
e_dvd_y_pred = electronics_model.predict(dvd_test_X)
e_elec_y_pred = electronics_model.predict(electronics_test_X)
e_kitchen_y_pred = electronics_model.predict(kitchen_test_X)
# Model trained on kitchen.
k_books_y_pred = kitchen_model.predict(books_test_X)
k_dvd_y_pred = kitchen_model.predict(dvd_test_X)
k_elec_y_pred = kitchen_model.predict(electronics_test_X)
k_kitchen_y_pred = kitchen_model.predict(kitchen_test_X)

# Model trained on the mixed-domain training set.
mixed_train_X = mixed_train["text"]
mixed_train_Y = mixed_train["polarity"]
mixed_test_X = mixed_test["text"]
mixed_test_Y = mixed_test["polarity"]
clf_mixed = Perceptron()
# Use a dedicated vectorizer: `vec` is still held by books_model, and
# refitting it here would silently change that model's vocabulary.
vec_mixed = CountVectorizer()
mixed_model = create_train_pipeline(vec_mixed, clf_mixed, mixed_train_X, mixed_train_Y)
m_books_y_pred = mixed_model.predict(books_test_X)
m_dvd_y_pred = mixed_model.predict(dvd_test_X)
m_elec_y_pred = mixed_model.predict(electronics_test_X)
m_kitchen_y_pred = mixed_model.predict(kitchen_test_X)
# Predict from the test texts; mixed_test_Y holds the labels to compare against.
m_mixed_y_pred = mixed_model.predict(mixed_test_X)