# loading libraries
import pandas as pd
import numpy as np
import spacy as sp
# loading data
# Base directory holding the LitCoin challenge data files (tab-separated
# despite the .csv extension).
folder = 'C:/Users/saaye/OneDrive/Documents/Machine Learning Projects/LitCoin NLP Challenge/data/'

def _read_tsv(name):
    """Read one tab-separated challenge file from the data folder."""
    return pd.read_csv(folder + name, sep='\t')

abstract_test = _read_tsv('abstracts_test.csv')
abstract_train = _read_tsv('abstracts_train.csv')
entities_train = _read_tsv('entities_train.csv')
relations_train = _read_tsv('relations_train.csv')
submission_example = _read_tsv('submission_example.csv')
# printing dimensions of all datasets
# BUG FIX: the original messages swapped the labels — DataFrame.shape is
# (rows, columns), so shape[0] is the row count and shape[1] the column count.
print(f'Abstract training dataset has {abstract_train.shape[0]} rows and {abstract_train.shape[1]} columns.')
print(f'Relations training dataset has {relations_train.shape[0]} rows and {relations_train.shape[1]} columns.')
print(f'Entities training dataset has {entities_train.shape[0]} rows and {entities_train.shape[1]} columns.')
Abstract training dataset has 400 rows and 3 columns.
Relations training dataset has 4280 rows and 6 columns.
Entities training dataset has 13636 rows and 7 columns.
# printing an example of the abstract database
# One (column, description) pair per field of the abstracts table; the loop
# reproduces the same console output as three separate hand-written prints.
print('An example of an observation is as follows: ')
_example_fields = [
    ("abstract_id", "PubMed ID of the research paper."),
    ("title", "title of the research paper."),
    ("abstract", "abstract or summary of the research paper."),
]
for _col, _desc in _example_fields:
    print(f' # {_col}: {_desc} : {abstract_train[_col][0]}')
An example of an observation is as follows:
# abstract_id: PubMed ID of the research paper. : 1353340
# title: title of the research paper. : Late-onset metachromatic leukodystrophy: molecular pathology in two siblings.
# abstract: abstract or summary of the research paper. : We report on a new allele at the arylsulfatase A (ARSA) locus causing late-onset metachromatic leukodystrophy (MLD). In that allele arginine84, a residue that is highly conserved in the arylsulfatase gene family, is replaced by glutamine. In contrast to alleles that cause early-onset MLD, the arginine84 to glutamine substitution is associated with some residual ARSA activity. A comparison of genotypes, ARSA activities, and clinical data on 4 individuals carrying the allele of 81 patients with MLD examined, further validates the concept that different degrees of residual ARSA activity are the basis of phenotypical variation in MLD..
# input string considered for these indices is the concatenation of the title and abstract strings
# Join title and abstract with a single space, then wrap the resulting
# Series into a one-column DataFrame named 'text'.
_full_text = abstract_train['title'] + ' ' + abstract_train['abstract']
df = _full_text.to_frame(name='text')
# custom entity recognition using spacy
nlp = sp.blank("en")  # blank English pipeline: tokenizer only, no trained components
ruler = nlp.add_pipe("entity_ruler")  # rule-based matcher that writes doc.ents
# Build one {"label": type, "pattern": mention} record per annotated mention.
# IMPROVEMENT: drop exact duplicate (type, mention) pairs first — the entities
# table repeats mentions across abstracts, and duplicate patterns slow the
# ruler down without changing which spans it matches.
patterns = (
    entities_train[["type", "mention"]]
    .drop_duplicates()
    .rename(columns={"type": "label", "mention": "pattern"})
    .to_dict("records")
)
ruler.add_patterns(patterns)  # adds patterns to the pipeline
texts = df['text'].tolist()
answers = []  # per-document list of (text, label, start_char, end_char) tuples
# NOTE: the loop variable `doc` intentionally leaks out of the loop — later
# code renders the final document with displacy.
for doc in nlp.pipe(texts, n_process=4, batch_size=2000):
    answers.append([(ent.text, ent.label_, ent.start_char, ent.end_char) for ent in doc.ents])
# create a named entity visualizer
# FIX: the original named this variable `type`, shadowing the builtin; it is
# used only within this block, so renaming it is safe.
labels = entities_train["type"].unique().tolist()  # distinct entity types, one color key each
palette = ["#5dd8d2", "#9d34f1", "#444c63", "#ec0639", "#57d921", "#fe2a9f"]  # color values
# Map each entity type to a palette color (raises IndexError if there are
# more types than colors, same as the original).
colors = {label: palette[i] for i, label in enumerate(labels)}
options = {"ents": labels, "colors": colors}  # assigns the colors to entity
# `doc` is the last document produced by the nlp.pipe loop above.
sp.displacy.render(doc, style="ent", jupyter=True, options=options)  # renders the entities
abstract_test["full"] = abstract_test["title"] + ' ' + abstract_test["abstract"]