# loading libraries
import pandas as pd
import numpy as np
import spacy as sp
# loading data
folder = 'C:/Users/saaye/OneDrive/Documents/Machine Learning Projects/LitCoin NLP Challenge/data/'
abstract_test = pd.read_csv(folder + 'abstracts_test.csv', sep='\t')
abstract_train = pd.read_csv(folder + 'abstracts_train.csv', sep='\t')
entities_train = pd.read_csv(folder + 'entities_train.csv', sep='\t')
relations_train = pd.read_csv(folder + 'relations_train.csv', sep='\t')
submission_example = pd.read_csv(folder + 'submission_example.csv', sep='\t')
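# optional sanity check (not part of the pipeline): peek at the first rows of each training
# table to confirm the tab-separated files loaded as expected
print(abstract_train.head())
print(entities_train.head())
print(relations_train.head())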
# printing dimensions of all datasets
print(f'Abstract training dataset has {abstract_train.shape[0]} rows and {abstract_train.shape[1]} columns.')
print(f'Relations training dataset has {relations_train.shape[0]} rows and {relations_train.shape[1]} columns.')
print(f'Entities training dataset has {entities_train.shape[0]} rows and {entities_train.shape[1]} columns.')
# printing an example of the abstract database
print(f'An example of an observation is as follows: \n # abstract_id: PubMed ID of the research paper. : {abstract_train["abstract_id"][0]}')
print(f' # title: title of the research paper. : {abstract_train["title"][0]}')
print(f' # abstract: abstract or summary of the research paper. : {abstract_train["abstract"][0]}')
# the entity offsets in entities_train refer to the concatenation of the title and abstract strings
df = pd.DataFrame({'text': abstract_train['title'] + ' ' + abstract_train['abstract']})
# custom entity recognition using spacy
nlp = sp.blank("en")  # creates a blank English pipeline (tokenizer only)
ruler = nlp.add_pipe("entity_ruler")  # adds an EntityRuler component to the pipeline
patterns = (entities_train[["type", "mention"]]
            .drop_duplicates()  # each (type, mention) pair only needs to be added once
            .rename(columns={"type": "label", "mention": "pattern"})
            .to_dict("records"))  # a list of {"label": ..., "pattern": ...} dicts
ruler.add_patterns(patterns)  # adds the patterns to the ruler
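# illustrative check (not required): run the ruler on the first training text to confirm the
# patterns fire; sample_doc is just a throwaway name for this example
sample_doc = nlp(df['text'][0])
print([(ent.text, ent.label_) for ent in sample_doc.ents][:10])  # first few matched entities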
texts = df['text'].tolist()
answers = []
for doc in nlp.pipe(texts, n_process=4, batch_size=2000):
    answers.append([(ent.text, ent.label_, ent.start_char, ent.end_char) for ent in doc.ents])
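# sketch: flatten the per-document entity tuples into a long DataFrame so they can be compared
# against entities_train; the column names used here (abstract_id, mention, type, start, end)
# are my own choice and may not match the official entities/submission layout exactly
pred_rows = []
for abs_id, ents in zip(abstract_train['abstract_id'], answers):
    for text_, label_, start_, end_ in ents:
        pred_rows.append({'abstract_id': abs_id, 'mention': text_, 'type': label_,
                          'start': start_, 'end': end_})
pred_entities = pd.DataFrame(pred_rows)
print(pred_entities.head())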
# create a named entity visualizer
labels = entities_train["type"].unique().tolist()  # entity types, used as colour keys
color = ["#5dd8d2", "#9d34f1", "#444c63", "#ec0639", "#57d921", "#fe2a9f"]  # colour values
colors = dict(zip(labels, color))  # maps each entity type to a colour
options = {"ents": labels, "colors": colors}  # displaCy options
sp.displacy.render(doc, style="ent", jupyter=True, options=options)  # renders the entities of the last doc processed above
abstract_test["full"] = abstract_test["title"] + ' ' + abstract_test["abstract"]  # same title + abstract concatenation for the test set
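# sketch of the next step: run the same rule-based pipeline over the concatenated test texts;
# mapping these spans into the submission_example format is not shown here
test_answers = []
for doc in nlp.pipe(abstract_test["full"].tolist(), n_process=4, batch_size=2000):
    test_answers.append([(ent.text, ent.label_, ent.start_char, ent.end_char) for ent in doc.ents])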