import subprocess

import numpy as np
import scipy.sparse
import sklearn.preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split


def data_preparation(questions, answers, professionals):
    test_mode = True
    # Merge the datasets: join each answer to its author (professional)
    # and to the question it answers
    answers = answers.merge(professionals, left_on='answers_author_id',
                            right_on='professionals_id', how='left')
    df = answers.merge(questions, left_on='answers_question_id',
                       right_on='questions_id', how='left')
    print(df.shape)
    # Only keep a subset of rows in test mode
    if test_mode:
        df = df.head(10000)
    answers_df = df
    # Define event strength as the number of answers a professional gave to a question
    answers_df['eventStrength'] = 1

    def smooth_professional_preference(x):
        # Pass-through for now; a log transform could be used here to dampen heavy answerers
        return x

    answers_full_df = answers_df \
        .groupby(['professionals_id', 'questions_id'])['eventStrength'].sum() \
        .apply(smooth_professional_preference).reset_index()
    # Restrict the questions dataset to the questions that survived the merge
    question_cols = questions.columns
    questions = df[question_cols].drop_duplicates()
    print('# of questions: %d' % len(questions))
    print('# of unique professional/question answers: %d' % len(answers_full_df))
    return questions, answers_full_df
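# A minimal usage sketch for the entry point above. The CSV paths are an
# assumption (the usual CareerVillage Kaggle dataset layout); adjust to your setup.
import pandas as pd

questions_raw = pd.read_csv('../input/questions.csv')
answers_raw = pd.read_csv('../input/answers.csv')
professionals_raw = pd.read_csv('../input/professionals.csv')
questions, answers_full_df = data_preparation(questions_raw, answers_raw, professionals_raw)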
def split_train_test_data(answers_full_df):
    answers_train_df, answers_test_df = train_test_split(answers_full_df,
                                                         test_size=0.20,
                                                         random_state=42)
    print('# answers on train set: %d' % len(answers_train_df))
    print('# answers on test set: %d' % len(answers_test_df))
    answers_full_indexed_df = answers_full_df.set_index('professionals_id')
    answers_test_indexed_df = answers_test_df.set_index('professionals_id')
    answers_train_indexed_df = answers_train_df.set_index('professionals_id')
    return (answers_train_df, answers_test_df, answers_full_indexed_df,
            answers_test_indexed_df, answers_train_indexed_df)
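# Usage sketch, continuing from the data_preparation call above: split the
# interaction table and keep the professional-indexed views for later lookups.
(answers_train_df, answers_test_df,
 answers_full_indexed_df, answers_test_indexed_df,
 answers_train_indexed_df) = split_train_test_data(answers_full_df)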
def process_text_using_tfidf(questions):
    # Preprocessing of the question text fields
    textfeats = ['questions_title', 'questions_body']
    for col in textfeats:
        questions[col] = questions[col].fillna('').astype(str)  # fill NAs before casting
        questions[col] = questions[col].str.lower()  # lowercase so capitalized words are not treated differently
    text = questions['questions_title'] + ' ' + questions['questions_body']
    vectorizer = TfidfVectorizer(strip_accents='unicode',
                                 analyzer='word',
                                 lowercase=True,         # convert all uppercase to lowercase
                                 stop_words='english',   # remove common English words ('it', 'a', 'the') that carry little signal
                                 max_df=0.9,             # ignore terms that appear in more than 90% of the documents
                                 # max_features=5000     # optional cap on the number of extracted features
                                 )
    question_ids = questions['questions_id'].tolist()
    tfidf_matrix = vectorizer.fit_transform(text)
    tfidf_feature_names = vectorizer.get_feature_names_out()
    return tfidf_matrix, tfidf_feature_names, question_ids
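# Usage sketch: vectorize the question text and peek at the strongest TF-IDF
# terms for the first question; variable names here are illustrative.
tfidf_matrix, tfidf_feature_names, question_ids = process_text_using_tfidf(questions)
row = tfidf_matrix[0].toarray().ravel()
top_idx = row.argsort()[::-1][:5]
print([(tfidf_feature_names[i], round(row[i], 3)) for i in top_idx])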
def bert_setup():
    # Install bert-as-service (server and client)
    !pip install bert-serving-server
    !pip install bert-serving-client
    # Download and unzip the pre-trained uncased BERT-Base model
    !wget http://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
    !unzip uncased_L-12_H-768_A-12.zip
    # Start the BERT server as a background process
    bert_command = 'bert-serving-start -model_dir /kaggle/working/uncased_L-12_H-768_A-12'
    process = subprocess.Popen(bert_command.split(), stdout=subprocess.PIPE)
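# The encoding step below needs a client connected to the server started above.
# A minimal sketch, assuming the bert-serving-client package installed by
# bert_setup(); BertClient() blocks until the server is ready to serve.
from bert_serving.client import BertClient

bert_setup()
bc = BertClient()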
def combine_two_list(x, y):
    # Concatenate the i-th element of x with the i-th element of y,
    # used to join the title and body embeddings into one vector per question
    return [a + b for a, b in zip(x, y)]
def process_text_using_bert_embeddings(questions):
    # Encode titles and bodies separately, then concatenate the two embeddings
    # per question; `bc` is the BertClient connected above
    title_embeddings = bc.encode(questions["questions_title"].tolist())
    body_embeddings = bc.encode(questions["questions_body"].tolist())
    question_embeddings = np.asarray(combine_two_list(title_embeddings.tolist(), body_embeddings.tolist()))
    # Return a sparse matrix so downstream code can treat TF-IDF and BERT features uniformly
    return scipy.sparse.csr_matrix(question_embeddings)
def get_question_profile(question_id, processed_text_matrix):
    # Look up the matrix row for a question (uses the global question_ids list)
    idx = question_ids.index(question_id)
    question_profile = processed_text_matrix[idx:idx + 1]
    return question_profile

def get_question_profiles(ids, processed_text_matrix):
    # `ids` may be a single id or an iterable; np.ravel normalizes both cases
    question_profiles_list = [get_question_profile(x, processed_text_matrix) for x in np.ravel([ids])]
    question_profiles = scipy.sparse.vstack(question_profiles_list)
    return question_profiles
def build_professionals_profile(professional_id, answers_indexed_df, processed_text_matrix):
    answers_professional_df = answers_indexed_df.loc[professional_id]
    professional_question_profiles = get_question_profiles(answers_professional_df['questions_id'], processed_text_matrix)
    professional_question_strengths = np.array(answers_professional_df['eventStrength']).reshape(-1, 1)
    # Weighted average of question profiles, weighted by answer strength
    professional_question_strengths_weighted_avg = \
        np.sum(professional_question_profiles.multiply(professional_question_strengths), axis=0) \
        / (np.sum(professional_question_strengths) + 1)
    professional_profile_norm = sklearn.preprocessing.normalize(professional_question_strengths_weighted_avg)
    return professional_profile_norm
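# Sketch: a professional profile and the question matrix live in the same vector
# space (the profile is an L2-normalized weighted average of question rows), so
# relevance can be scored with cosine similarity. This helper is illustrative
# and not part of the original notebook.
from sklearn.metrics.pairwise import cosine_similarity

def most_similar_questions(professional_profile, processed_text_matrix, topn=5):
    # One similarity score per question; higher means more relevant
    sims = cosine_similarity(professional_profile, processed_text_matrix).ravel()
    best = sims.argsort()[::-1][:topn]
    return [(question_ids[i], sims[i]) for i in best]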
from tqdm import tqdm

def build_professionals_profiles(answers_full_df, processed_text_matrix):
    # Keep only answers whose question made it into the (possibly truncated)
    # questions set; `questions` is the global produced by data_preparation
    answers_indexed_df = answers_full_df[answers_full_df['questions_id'].isin(questions['questions_id'])].set_index('professionals_id')
    professional_profiles = {}
    for professional_id in tqdm(answers_indexed_df.index.unique()):
        professional_profiles[professional_id] = build_professionals_profile(professional_id, answers_indexed_df, processed_text_matrix)
    print("# of professionals with profiles: %d" % len(professional_profiles))
    return professional_profiles
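# ContentBasedRecommender is not defined in this section. The sketch below is an
# assumption of its shape, consistent with the profile builders above: rank
# questions by cosine similarity between the professional's profile and the
# question text matrix. Filtering of already-answered questions is omitted.
from sklearn.metrics.pairwise import cosine_similarity

class ContentBasedRecommender:
    def __init__(self, questions):
        self.questions = questions

    def recommend_questions(self, professional_id, processed_text_matrix, topn=10):
        # Assumes `professional_profiles` was built by build_professionals_profiles
        profile = professional_profiles[professional_id]
        sims = cosine_similarity(profile, processed_text_matrix).ravel()
        best = sims.argsort()[::-1][:topn]
        recs = self.questions.iloc[best][['questions_id', 'questions_title']].copy()
        recs['similarity'] = sims[best]
        return recs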
# Sample professional ids to exercise the recommender with
myprofessional1 = "000d4635e5da41e3bfd83677ee11dda4"
myprofessional2 = "00271cc10e0245fba4a35e76e669c281"

cbr_model_tfidf = ContentBasedRecommender(questions)
cbr_model_tfidf.recommend_questions(myprofessional2, tfidf_matrix)