import time
import numpy as np
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_extraction.text import HashingVectorizer
from tokenwiser.textprep import Cleaner, Identity, HyphenTextPrep
from tokenwiser.pipeline import make_partial_pipeline, make_partial_union
df = pd.read_csv("data/imdb-reviews.csv")
df_amazon = pd.read_csv("data/amazon-reviews.csv")
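# Quick sanity-check sketch (not in the original): the code below assumes both
# CSVs have 'text' and 'label' columns, with labels 'positive'/'negative'.
print(df['label'].value_counts())
print(df_amazon['label'].value_counts())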
def create_model_pipeline():
    featurizers = []
    # Create HashingVectorizers of different sizes. The sizes need to differ so
    # that each vectorizer effectively applies a different hash function.
    for size in [1998, 1999, 2000, 2001]:
        featurizers.append(
            HashingVectorizer(n_features=size, binary=True, norm=None)
        )
    return make_partial_pipeline(
        make_partial_union(*featurizers),
        SGDClassifier()
    )
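# A minimal side sketch (not part of the pipeline above) illustrating why the
# sizes differ: the same text hashes to different bucket indices when
# n_features changes, so the union behaves like several distinct hash functions.
vec_a = HashingVectorizer(n_features=1998, binary=True, norm=None)
vec_b = HashingVectorizer(n_features=2001, binary=True, norm=None)
print(vec_a.transform(["what a great movie"]).indices)  # bucket indices under size 1998
print(vec_b.transform(["what a great movie"]).indices)  # different indices under size 2001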
from memo import grid, memlist, memfunc, time_taken, Runner
data = []
@memlist(data=data)
@memfunc(print)
@time_taken()
def run_simple_benchmark(pretrain=0, epochs=50, n_train=1000):
    """
    pretrain: how many batches of sampled data to pre-train on from the Amazon data
    epochs: how many batches of sampled data to train on from the IMDB data
    n_train: how large a train set we pretend to have from the IMDB data
    """
    # pretend that we have a smaller train set and create the model
    df_train, df_valid = df.head(n_train), df.tail(10000)
    pipe = create_model_pipeline()
    tic = time.time()
    # pre-train on the Amazon dataset
    for i in range(pretrain):
        sample = df_amazon.sample(250)
        pipe.partial_fit(list(sample['text']), sample['label'], classes=['positive', 'negative'])
    # train on the IMDB dataset
    for i in range(epochs):
        sample = df_train.sample(250)
        pipe.partial_fit(list(sample['text']), sample['label'], classes=['positive', 'negative'])
    toc = time.time()
    # calculate the score that we care about
    score = np.mean(pipe.predict(list(df_valid['text'])) == df_valid['label'])
    return {
        'score': score,
        'train_time': int(toc - tic)
    }
settings = grid(pretrain=[0, 50, 100], epochs=[10, 20, 50, 100], n_train=[300, 500, 1000, 2000])
for setting in settings:
    run_simple_benchmark(**setting)
pd.DataFrame(data).to_csv("results.csv", index=False)
pd.read_csv("results.csv").pivot(index=["pretrain", "epochs"], columns="n_train", values="score").reset_index()
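# A follow-up sketch: aggregate the benchmark results to see whether pre-training
# on the Amazon reviews helped. memlist logs each call's settings together with
# the returned 'score' and 'train_time' keys, so these columns exist in the CSV.
results = pd.read_csv("results.csv")
print(results.groupby("pretrain")["score"].mean())
print(results.groupby("pretrain")["train_time"].mean())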
import numpy as np
import pandas as pd
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
from tokenwiser.pipeline import make_partial_pipeline, make_partial_union
df = pd.read_csv("data/imdb-reviews.csv")
df_train, df_valid = df.head(2000), df.tail(10000)
pipe = make_partial_pipeline(CountVectorizer(), LogisticRegression(max_iter=1000))
pipe.fit(df_train['text'], df_train['label'])
np.mean(pipe.predict(list(df_valid['text'])) == df_valid['label'])
df_train, df_valid = df.head(40000), df.tail(10000)
pipe = make_partial_pipeline(CountVectorizer(), LogisticRegression(max_iter=2000))
pipe.fit(df_train['text'], df_train['label'])
np.mean(pipe.predict(list(df_valid['text'])) == df_valid['label'])
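# A hedged comparison sketch: train the partial_fit pipeline from
# create_model_pipeline() above on the same 40k rows in mini-batches, so it can
# be compared against the batch LogisticRegression fit. The batch count is an
# assumption: 200 batches of 250 samples is roughly one pass over the data.
pipe = create_model_pipeline()
for i in range(200):
    sample = df_train.sample(250)
    pipe.partial_fit(list(sample['text']), sample['label'], classes=['positive', 'negative'])
np.mean(pipe.predict(list(df_valid['text'])) == df_valid['label'])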
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
from whatlies.language import UniversalSentenceLanguage
pipe = make_partial_pipeline(
    make_partial_union(
        CountVectorizer(),
        UniversalSentenceLanguage()
    ),
    LogisticRegression()
)
pipe.fit(list(df_train['text']), df_train['label'])
np.mean(pipe.predict(list(df_valid['text'])) == df_valid['label'])