import pandas as pd
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, confusion_matrix
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
import matplotlib.pyplot as plt
with open('fantasy_10000.json') as fp:
    dumped_dict = fp.read()
with open('clean_fantasy_10000.json', 'w') as fp:
    fp.write('[\n' + ',\n'.join(dumped_dict.splitlines()) + '\n]')
fantasy_data = pd.read_json('clean_fantasy_10000.json')
fantasy_data
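# Optional alternative (a sketch, assuming fantasy_10000.json is newline-delimited
# JSON, i.e. one record per line, which the wrapping above implies): pandas can
# read such a file directly with lines=True, skipping the manual cleanup.
# `fantasy_data_lines` is just an illustrative name.
fantasy_data_lines = pd.read_json('fantasy_10000.json', lines=True)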
relation_rating_length = fantasy_data[['rating', 'review_text']]
fantasy_data['review_text_length'] = fantasy_data.review_text.str.len()
fantasy_data
fantasy_data[['rating', 'review_text_length']]
axe = fantasy_data.plot(x='review_text_length', y='rating', kind='scatter', title="Review rating vs. review text length")
axe.set_ylabel('Review rating [stars]')
axe.set_xlabel('Review text length [character count]')
lg = LinearRegression(copy_X=True)
model = lg.fit(fantasy_data[['review_text_length']], fantasy_data['rating'])
results = pd.concat([fantasy_data['review_text_length'], fantasy_data['rating']], axis=1)
results['predicted_rating'] = model.predict(fantasy_data[['review_text_length']])
results
pd.DataFrame(data=[{'MSE': mean_squared_error(results['rating'], results['predicted_rating']), "theta0": model.intercept_, "theta1": model.coef_[0]}])
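# Sanity check (optional): LinearRegression's predictions are just the linear form
# theta0 + theta1 * review_text_length, so recomputing them by hand should match
# model.predict exactly.
import numpy as np
manual_pred = model.intercept_ + model.coef_[0] * fantasy_data['review_text_length']
np.allclose(manual_pred, results['predicted_rating'])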
lg = LinearRegression(copy_X=True)
model = lg.fit(fantasy_data[['review_text_length', 'n_comments']], fantasy_data['rating'])
results = fantasy_data[['review_text_length', 'n_comments', 'rating']].copy()
results['predicted_rating'] = model.predict(fantasy_data[['review_text_length', 'n_comments']])
results
pd.DataFrame(data=[{'MSE': mean_squared_error(results['rating'], results['predicted_rating']), "theta0": model.intercept_, "theta1": model.coef_[0], "theta2": model.coef_[1]}])
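# With two predictors the fitted model is
# rating ≈ theta0 + theta1 * review_text_length + theta2 * n_comments,
# where theta1 and theta2 come from model.coef_ in the same column order as the
# features passed to fit().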
train_data = pd.DataFrame()
train_data['length1'] = MinMaxScaler().fit_transform(fantasy_data[['review_text_length']])[:, 0]
for i in range(2, 6):
    train_data[f'length{i}'] = train_data.length1 ** i
train_data['rating'] = fantasy_data.rating
train_data
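# Equivalent construction (a sketch): scikit-learn's PolynomialFeatures expands the
# single scaled column into x, x^2, ..., x^5, matching the manual loop above
# (include_bias=False drops the constant column, which LinearRegression adds itself).
# `poly_check` is just an illustrative name.
from sklearn.preprocessing import PolynomialFeatures
poly_check = PolynomialFeatures(degree=5, include_bias=False).fit_transform(train_data[['length1']])
poly_check.shape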
rows = []
for i in range(1, 6):
    lg = LinearRegression(copy_X=True)
    features = [f'length{j}' for j in range(1, i + 1)]
    model = lg.fit(train_data[features], train_data['rating'])
    rows.append({'max_polynomial_degree': i,
                 'MSE': mean_squared_error(train_data['rating'], model.predict(train_data[features]))})
results = pd.DataFrame(rows, index=[f'Degree {i}' for i in range(1, 6)])
results
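# Note: because each higher-degree model nests the lower-degree ones, the training
# MSE above can only decrease (or stay flat) as the degree grows. That alone says
# nothing about generalisation, which the train/test split below checks.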
rows = []
for i in range(1, 6):
    lg = LinearRegression(copy_X=True)
    features = [f'length{j}' for j in range(1, i + 1)]
    X_train, X_test, y_train, y_test = train_test_split(train_data[features], train_data['rating'], train_size=0.5, test_size=0.5)
    model = lg.fit(X_train, y_train)
    rows.append({'max_polynomial_degree': i,
                 'MSE Test': mean_squared_error(y_test, model.predict(X_test)),
                 'MSE Train': mean_squared_error(y_train, model.predict(X_train))})
results = pd.DataFrame(rows, index=[f'Degree {i}' for i in range(1, 6)])
results
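# Reading the table: a test MSE that grows while the train MSE keeps shrinking as
# the degree increases is the usual sign of overfitting; similar values on both
# splits suggest the extra polynomial terms change little either way.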
import ast
import json
with open('beer_50000.json') as fp:
    dumped_dict = fp.read()
with open('clean_beer_50000.json', 'w') as fp:
    items = []
    for line in dumped_dict.splitlines():
        items.append(ast.literal_eval(line))
    json.dump(items, fp)
beers = pd.read_json('clean_beer_50000.json')
beers = beers[beers['user/gender'].notna()]
beers
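# Quick check of the class balance (optional): self-reported gender in review
# datasets is often heavily skewed towards one class, which is why a
# class-weighted model is also tried further down.
beers['user/gender'].value_counts()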
beers['review/length'] = beers['review/text'].str.len()
data = beers[["review/length"]]
labels = (beers['user/gender'] == 'Male').astype(int)
X_train, X_test, y_train, y_test = train_test_split(data, labels)
logistic = LogisticRegression()
model = logistic.fit(X_train, y_train)
conf_mat_train = confusion_matrix(y_train, model.predict(X_train))
conf_mat_train
axes = plt.matshow(conf_mat_train, cmap="gray")
plt.xticks([0, 1], ['False', 'True'])
plt.yticks([0, 1], ['Negative', 'Positive'])
conf_mat_test = confusion_matrix(y_test, model.predict(X_test))
conf_mat_test
axes = plt.matshow(conf_mat_test, cmap="gray")
plt.xticks([0, 1], ['False', 'True'])
plt.yticks([0, 1], ['Negative', 'Positive'])
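# With imbalanced labels a plain LogisticRegression can reach high accuracy by
# predicting the majority class almost everywhere, which shows up in the confusion
# matrices above as one dominant column. class_weight='balanced' reweights samples
# inversely to their class frequency to counter that.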
logistic = LogisticRegression(class_weight='balanced')
model = logistic.fit(X_train, y_train)
conf_mat_train = confusion_matrix(y_train, model.predict(X_train))
conf_mat_train
axes = plt.matshow(conf_mat_train, cmap="gray")
plt.xticks([0, 1], ['False', 'True'])
plt.yticks([0, 1], ['Negative', 'Positive'])
conf_mat_test = confusion_matrix(y_test, model.predict(X_test))
conf_mat_test
axes = plt.matshow(conf_mat_test, cmap="gray")
plt.xticks([0, 1], ['False', 'True'])
plt.yticks([0, 1], ['Negative', 'Positive'])
data = pd.DataFrame()
data['beer/ABV'] = beers['beer/ABV']
data['review/length'] = beers['review/text'].str.len()
ohe = OneHotEncoder()
ohe_styles = ohe.fit_transform(beers[['beer/style']])
ohe_styles.toarray()
data = pd.concat((data, pd.DataFrame(ohe_styles.toarray(), index=data.index, columns=ohe.get_feature_names_out())), axis=1)
data
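# Equivalent encoding (a sketch): pandas' get_dummies produces the same indicator
# columns for beer/style directly as a labelled DataFrame. `dummies_check` is just
# an illustrative name.
dummies_check = pd.get_dummies(beers['beer/style'], prefix='beer/style')
dummies_check.shape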
X_train, X_test, y_train, y_test = train_test_split(data, beers["user/gender"])
logistic = LogisticRegression(class_weight='balanced', max_iter=1000)
model = logistic.fit(X_train, y_train)
conf_mat_train = confusion_matrix(y_train, model.predict(X_train))
conf_mat_train
axes = plt.matshow(conf_mat_train, cmap="gray")
plt.xticks([0, 1], ['False', 'True'])
plt.yticks([0, 1], ['Negative', 'Positive'])
conf_mat_test = confusion_matrix(y_test, model.predict(X_test))
conf_mat_test
axes = plt.matshow(conf_mat_test, cmap="gray")
plt.xticks([0, 1], ['False', 'True'])
plt.yticks([0, 1], ['Negative', 'Positive'])
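# Optional scalar summary (a sketch): balanced accuracy averages recall over the
# two classes, so it is a fairer single number than plain accuracy for this
# imbalanced test set.
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, model.predict(X_test))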