import subprocess

import numpy as np
import scipy.sparse
import sklearn.preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split


def data_preparation(questions, answers, professionals):
    test_mode = True
    # Merge the datasets: join each answer to its author (professional)
    # and to the question it answers
    answers = answers.merge(professionals, left_on='answers_author_id',
                            right_on='professionals_id', how='left')
    df = answers.merge(questions, left_on='answers_question_id',
                       right_on='questions_id', how='left')
    print(df.shape)
    # Only keep a subset of rows in test mode
    if test_mode:
        df = df.head(10000)
    answers_df = df
    # Define event strength as the number of answers a professional gave to a question
    answers_df['eventStrength'] = 1

    def smooth_professional_preference(x):
        # Pass-through for now; a log transform could be used here to dampen heavy answerers
        return x

    answers_full_df = answers_df \
        .groupby(['professionals_id', 'questions_id'])['eventStrength'].sum() \
        .apply(smooth_professional_preference).reset_index()
    # Restrict the questions dataset to the questions that survived the merge
    question_cols = questions.columns
    questions = df[question_cols].drop_duplicates()
    print('# of questions: %d' % len(questions))
    print('# of unique professional/question answers: %d' % len(answers_full_df))
    return questions, answers_full_df
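# A minimal usage sketch for the entry point above. The CSV paths are an
# assumption (the usual CareerVillage Kaggle dataset layout); adjust to your setup.
import pandas as pd

questions_raw = pd.read_csv('../input/questions.csv')
answers_raw = pd.read_csv('../input/answers.csv')
professionals_raw = pd.read_csv('../input/professionals.csv')
questions, answers_full_df = data_preparation(questions_raw, answers_raw, professionals_raw)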
def split_train_test_data(answers_full_df):
    answers_train_df, answers_test_df = train_test_split(answers_full_df,
                                                         test_size=0.20,
                                                         random_state=42)
    print('# answers on train set: %d' % len(answers_train_df))
    print('# answers on test set: %d' % len(answers_test_df))
    answers_full_indexed_df = answers_full_df.set_index('professionals_id')
    answers_test_indexed_df = answers_test_df.set_index('professionals_id')
    answers_train_indexed_df = answers_train_df.set_index('professionals_id')
    return (answers_train_df, answers_test_df, answers_full_indexed_df,
            answers_test_indexed_df, answers_train_indexed_df)
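# Usage sketch, continuing from the data_preparation call above: split the
# interaction table and keep the professional-indexed views for later lookups.
(answers_train_df, answers_test_df,
 answers_full_indexed_df, answers_test_indexed_df,
 answers_train_indexed_df) = split_train_test_data(answers_full_df)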
def process_text_using_tfidf(questions):
    # Preprocessing of the question text fields
    textfeats = ['questions_title', 'questions_body']
    for col in textfeats:
        questions[col] = questions[col].fillna('').astype(str)  # fill NAs before casting
        questions[col] = questions[col].str.lower()  # lowercase so capitalized words are not treated differently
    text = questions['questions_title'] + ' ' + questions['questions_body']
    vectorizer = TfidfVectorizer(strip_accents='unicode',
                                 analyzer='word',
                                 lowercase=True,         # convert all uppercase to lowercase
                                 stop_words='english',   # remove common English words ('it', 'a', 'the') that carry little signal
                                 max_df=0.9,             # ignore terms that appear in more than 90% of the documents
                                 # max_features=5000     # optional cap on the number of extracted features
                                 )
    question_ids = questions['questions_id'].tolist()
    tfidf_matrix = vectorizer.fit_transform(text)
    tfidf_feature_names = vectorizer.get_feature_names_out()
    return tfidf_matrix, tfidf_feature_names, question_ids
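# Usage sketch: vectorize the question text and peek at the strongest TF-IDF
# terms for the first question; variable names here are illustrative.
tfidf_matrix, tfidf_feature_names, question_ids = process_text_using_tfidf(questions)
row = tfidf_matrix[0].toarray().ravel()
top_idx = row.argsort()[::-1][:5]
print([(tfidf_feature_names[i], round(row[i], 3)) for i in top_idx])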
def bert_setup():
    # Install bert-as-service (server and client)
    !pip install bert-serving-server
    !pip install bert-serving-client
    # Download and unzip the pre-trained uncased BERT-Base model
    !wget http://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
    !unzip uncased_L-12_H-768_A-12.zip
    # Start the BERT server as a background process
    bert_command = 'bert-serving-start -model_dir /kaggle/working/uncased_L-12_H-768_A-12'
    process = subprocess.Popen(bert_command.split(), stdout=subprocess.PIPE)
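# The encoding step below needs a client connected to the server started above.
# A minimal sketch, assuming the bert-serving-client package installed by
# bert_setup(); BertClient() blocks until the server is ready to serve.
from bert_serving.client import BertClient

bert_setup()
bc = BertClient()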
def combine_two_list(x, y):
    # Concatenate the i-th element of x with the i-th element of y,
    # used to join the title and body embeddings into one vector per question
    return [a + b for a, b in zip(x, y)]
def process_text_using_bert_embeddings(questions):
    # Encode titles and bodies separately, then concatenate the two embeddings
    # per question; `bc` is the BertClient connected above
    title_embeddings = bc.encode(questions["questions_title"].tolist())
    body_embeddings = bc.encode(questions["questions_body"].tolist())
    question_embeddings = np.asarray(combine_two_list(title_embeddings.tolist(), body_embeddings.tolist()))
    # Return a sparse matrix so downstream code can treat TF-IDF and BERT features uniformly
    return scipy.sparse.csr_matrix(question_embeddings)
def get_question_profile(question_id, processed_text_matrix):
    # Look up the matrix row for a question (uses the global question_ids list)
    idx = question_ids.index(question_id)
    question_profile = processed_text_matrix[idx:idx + 1]
    return question_profile

def get_question_profiles(ids, processed_text_matrix):
    # `ids` may be a single id or an iterable; np.ravel normalizes both cases
    question_profiles_list = [get_question_profile(x, processed_text_matrix) for x in np.ravel([ids])]
    question_profiles = scipy.sparse.vstack(question_profiles_list)
    return question_profiles
def build_professionals_profile(professional_id, answers_indexed_df, processed_text_matrix):
    answers_professional_df = answers_indexed_df.loc[professional_id]
    professional_question_profiles = get_question_profiles(answers_professional_df['questions_id'], processed_text_matrix)
    professional_question_strengths = np.array(answers_professional_df['eventStrength']).reshape(-1, 1)
    # Weighted average of question profiles, weighted by answer strength
    professional_question_strengths_weighted_avg = \
        np.sum(professional_question_profiles.multiply(professional_question_strengths), axis=0) \
        / (np.sum(professional_question_strengths) + 1)
    professional_profile_norm = sklearn.preprocessing.normalize(professional_question_strengths_weighted_avg)
    return professional_profile_norm
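# Sketch: a professional profile and the question matrix live in the same vector
# space (the profile is an L2-normalized weighted average of question rows), so
# relevance can be scored with cosine similarity. This helper is illustrative
# and not part of the original notebook.
from sklearn.metrics.pairwise import cosine_similarity

def most_similar_questions(professional_profile, processed_text_matrix, topn=5):
    # One similarity score per question; higher means more relevant
    sims = cosine_similarity(professional_profile, processed_text_matrix).ravel()
    best = sims.argsort()[::-1][:topn]
    return [(question_ids[i], sims[i]) for i in best]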
from tqdm import tqdm

def build_professionals_profiles(answers_full_df, processed_text_matrix):
    # Keep only answers whose question made it into the (possibly truncated)
    # questions set; `questions` is the global produced by data_preparation
    answers_indexed_df = answers_full_df[answers_full_df['questions_id'].isin(questions['questions_id'])].set_index('professionals_id')
    professional_profiles = {}
    for professional_id in tqdm(answers_indexed_df.index.unique()):
        professional_profiles[professional_id] = build_professionals_profile(professional_id, answers_indexed_df, processed_text_matrix)
    print("# of professionals with profiles: %d" % len(professional_profiles))
    return professional_profiles
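# ContentBasedRecommender is not defined in this section. The sketch below is an
# assumption of its shape, consistent with the profile builders above: rank
# questions by cosine similarity between the professional's profile and the
# question text matrix. Filtering of already-answered questions is omitted.
from sklearn.metrics.pairwise import cosine_similarity

class ContentBasedRecommender:
    def __init__(self, questions):
        self.questions = questions

    def recommend_questions(self, professional_id, processed_text_matrix, topn=10):
        # Assumes `professional_profiles` was built by build_professionals_profiles
        profile = professional_profiles[professional_id]
        sims = cosine_similarity(profile, processed_text_matrix).ravel()
        best = sims.argsort()[::-1][:topn]
        recs = self.questions.iloc[best][['questions_id', 'questions_title']].copy()
        recs['similarity'] = sims[best]
        return recs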
# Sample professional ids to exercise the recommender with
myprofessional1 = "000d4635e5da41e3bfd83677ee11dda4"
myprofessional2 = "00271cc10e0245fba4a35e76e669c281"

cbr_model_tfidf = ContentBasedRecommender(questions)
cbr_model_tfidf.recommend_questions(myprofessional2, tfidf_matrix)