# loading libraries
import pandas as pd
import numpy as np
import spacy as sp
# loading data
# Base directory holding the LitCoin challenge data files (tab-separated
# despite the .csv extension).
folder = 'C:/Users/saaye/OneDrive/Documents/Machine Learning Projects/LitCoin NLP Challenge/data/'

def _read_tsv(name):
    """Read one tab-separated challenge file from the data folder."""
    return pd.read_csv(folder + name, sep='\t')

abstract_test = _read_tsv('abstracts_test.csv')
abstract_train = _read_tsv('abstracts_train.csv')
entities_train = _read_tsv('entities_train.csv')
relations_train = _read_tsv('relations_train.csv')
submission_example = _read_tsv('submission_example.csv')
# printing dimensions of all datasets
# BUG FIX: the original messages swapped the labels — DataFrame.shape is
# (rows, columns), so shape[0] is the row count and shape[1] the column count.
print(f'Abstract training dataset has {abstract_train.shape[0]} rows and {abstract_train.shape[1]} columns.')
print(f'Relations training dataset has {relations_train.shape[0]} rows and {relations_train.shape[1]} columns.')
print(f'Entities training dataset has {entities_train.shape[0]} rows and {entities_train.shape[1]} columns.')
Abstract training dataset has 400 rows and 3 columns.
Relations training dataset has 4280 rows and 6 columns.
Entities training dataset has 13636 rows and 7 columns.
# printing an example of the abstract database
# One (column, description) pair per field of the abstracts table; the loop
# reproduces the same console output as three separate hand-written prints.
print('An example of an observation is as follows: ')
_example_fields = [
    ("abstract_id", "PubMed ID of the research paper."),
    ("title", "title of the research paper."),
    ("abstract", "abstract or summary of the research paper."),
]
for _col, _desc in _example_fields:
    print(f' # {_col}: {_desc} : {abstract_train[_col][0]}')
An example of an observation is as follows:
# abstract_id: PubMed ID of the research paper. : 1353340
# title: title of the research paper. : Late-onset metachromatic leukodystrophy: molecular pathology in two siblings.
# abstract: abstract or summary of the research paper. : We report on a new allele at the arylsulfatase A (ARSA) locus causing late-onset metachromatic leukodystrophy (MLD). In that allele arginine84, a residue that is highly conserved in the arylsulfatase gene family, is replaced by glutamine. In contrast to alleles that cause early-onset MLD, the arginine84 to glutamine substitution is associated with some residual ARSA activity. A comparison of genotypes, ARSA activities, and clinical data on 4 individuals carrying the allele of 81 patients with MLD examined, further validates the concept that different degrees of residual ARSA activity are the basis of phenotypical variation in MLD..
# input string considered for these indices is the concatenation of the title and abstract strings
# Join title and abstract with a single space, then wrap the resulting
# Series into a one-column DataFrame named 'text'.
_full_text = abstract_train['title'] + ' ' + abstract_train['abstract']
df = _full_text.to_frame(name='text')
# custom entity recognition using spacy
nlp = sp.blank("en")  # blank English pipeline: tokenizer only, no trained components
ruler = nlp.add_pipe("entity_ruler")  # rule-based matcher that writes doc.ents
# Build one {"label": type, "pattern": mention} record per annotated mention.
# IMPROVEMENT: drop exact duplicate (type, mention) pairs first — the entities
# table repeats mentions across abstracts, and duplicate patterns slow the
# ruler down without changing which spans it matches.
patterns = (
    entities_train[["type", "mention"]]
    .drop_duplicates()
    .rename(columns={"type": "label", "mention": "pattern"})
    .to_dict("records")
)
ruler.add_patterns(patterns)  # adds patterns to the pipeline
texts = df['text'].tolist()
answers = []  # per-document list of (text, label, start_char, end_char) tuples
# NOTE: the loop variable `doc` intentionally leaks out of the loop — later
# code renders the final document with displacy.
for doc in nlp.pipe(texts, n_process=4, batch_size=2000):
    answers.append([(ent.text, ent.label_, ent.start_char, ent.end_char) for ent in doc.ents])
# create a named entity visualizer
# FIX: the original named this variable `type`, shadowing the builtin; it is
# used only within this block, so renaming it is safe.
labels = entities_train["type"].unique().tolist()  # distinct entity types, one color key each
palette = ["#5dd8d2", "#9d34f1", "#444c63", "#ec0639", "#57d921", "#fe2a9f"]  # color values
# Map each entity type to a palette color (raises IndexError if there are
# more types than colors, same as the original).
colors = {label: palette[i] for i, label in enumerate(labels)}
options = {"ents": labels, "colors": colors}  # assigns the colors to entity
# `doc` is the last document produced by the nlp.pipe loop above.
sp.displacy.render(doc, style="ent", jupyter=True, options=options)  # renders the entities
abstract_test["full"] = abstract_test["title"] + ' ' + abstract_test["abstract"]