Tweet sentiment extraction with spaCy

import numpy as np
import pandas as pd
import seaborn as sb
import os
# List every file found under the 'input' directory.
for dirname, _, filenames in os.walk('input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
import re
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from tqdm import tqdm
import os
import nltk
import spacy
import random
from spacy.util import compounding
from spacy.util import minibatch
from spacy.training.example import Example
import warnings
# Silence every warning for the rest of the run.
warnings.filterwarnings("ignore")
# Use the seaborn "darkgrid" style for all plots.
sb.set(style="darkgrid")

# Folder holding the competition CSV files, and a label for the current run
# (the label is used to name the folder where figures are saved).
BASE_PATH = '/kaggle/input/tweet-sentiment-extraction/'
model_name = "10iter"

# Load the training set, the test set and the sample submission.
train_df, test_df, submission_df = (
    pd.read_csv(BASE_PATH + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

def _wordcloud_corpus(value):
    """Turn one entry of the word-cloud mapping into a single text string."""
    if isinstance(value, str):
        return value
    try:
        # str() on a pandas Series yields its truncated repr (a few rows plus
        # "Name: ..., dtype: ..."), so join the individual items instead.
        return " ".join(str(item) for item in value)
    except TypeError:
        # Not iterable: fall back to the plain string conversion.
        return str(value)


def Mcreate_wordcloud_row(texts):
    """Draw one word cloud per entry of ``texts``, side by side on one row.

    Args:
        texts (dict): maps a subplot title to the text to draw. A value may be
            a string or an iterable of strings (e.g. a pandas Series of
            tweets); iterables are joined with spaces.

    Returns:
        None. Nothing is drawn when ``texts`` is empty.
    """
    if not texts:
        return

    count = len(texts)
    # squeeze=False keeps ``axs`` two-dimensional even with a single entry, so
    # indexing works for any number of word clouds.
    fig, axs = plt.subplots(nrows=1, ncols=count,
                            figsize=(count * 10, count * 10), squeeze=False)

    # Generate and draw a word cloud for each text
    for ax, (key, value) in zip(axs[0], texts.items()):
        wordcloud = WordCloud(
            background_color='white',
            max_words=200,
            max_font_size=40,
            scale=3,
            random_state=42,
        ).generate(_wordcloud_corpus(value))
        ax.set(title=str(key))
        ax.imshow(wordcloud, interpolation='bilinear')
        ax.axis("off")

def barplot_boxplot_row(df, title, df1, title1):
    """Show two datasets side by side, each as a box plot above a histogram.

    Args:
        df: values plotted in the left column.
        title: title of the left column.
        df1: values plotted in the right column.
        title1: title of the right column.
    """
    # set a grey background (use sb.set_theme() if seaborn version 0.11.0 or above)
    sb.set(style="darkgrid")

    # 2x2 grid: a thin top row for the box plots, a tall bottom row for the
    # histograms; all subplots share the x axis.
    figure, axes_grid = plt.subplots(
        2, 2,
        sharex=True,
        figsize=(20, 5),
        gridspec_kw={"height_ratios": (.15, .85)},
    )
    box_axes, hist_axes = axes_grid

    for column, (values, heading) in enumerate(((df, title), (df1, title1))):
        sb.boxplot(values, ax=box_axes[column])
        sb.histplot(data=values, ax=hist_axes[column])
        # Remove the x axis name of the box plot and put the title on top
        box_axes[column].set(xlabel='', title=heading)

    plt.show()

# Drop the rows that contain missing values before any analysis.
train_df = train_df.dropna()

# Number of tweets per sentiment, one colour per sentiment.
plt.figure(figsize=(15, 5))
_sentiment_axes = sb.histplot(data=train_df, x='sentiment', hue="sentiment")
_sentiment_axes.set(title='Répartition des sentiments dans le dataset ')

## Barplot + histplot
# Length distributions of the neutral tweets and of their selected text.
neutral_df = train_df.loc[train_df['sentiment'] == "neutral"]
neutral_text = neutral_df.text
neutral_selected_text = neutral_df.selected_text
# The titles now name the neutral sentiment (they previously said "Positive"
# for both plots, and the second plot reused the first plot's title).
barplot_boxplot_row(
    neutral_text.str.len(),
    "Longueur des tweet pour les sentiments Neutre",
    neutral_selected_text.str.len(),
    "Longueur des texte determinant pour les sentiments Neutre",
)

## WordCloud
wordClouds = {
    'Nuage de mot pour les tweet neutre': neutral_text,
    'Nuage de mot pour les mot determinant pour les sentiments neutre': neutral_selected_text,
}
Mcreate_wordcloud_row(wordClouds)

# Notebook display of the neutral subset.
neutral_df

## Barplot + histplot
# Length distributions of the positive tweets and of their selected text.
df_positive = train_df[train_df['sentiment'] == "positive"]
positive_text, positive_selected_text = df_positive['text'], df_positive['selected_text']
barplot_boxplot_row(
    positive_text.str.len(),
    "Longueur des tweet pour les sentiments Positive",
    positive_selected_text.str.len(),
    "Longueur des texte determinant pour les sentiments Postive ",
)

## WordCloud
wordClouds = dict([
    ('Nuage de mot pour les tweet positive', positive_text),
    ('Nuage de mot pour les mot determinant pour les sentiments Positive', positive_selected_text),
])
Mcreate_wordcloud_row(wordClouds)

# Length distributions of the negative tweets and of their selected text.
_is_negative = train_df['sentiment'] == "negative"
negative_text = train_df.loc[_is_negative, 'text']
negative_selected_text = train_df.loc[_is_negative, 'selected_text']
barplot_boxplot_row(
    negative_text.str.len(),
    "Longueur des tweet pour les sentiments Negative",
    negative_selected_text.str.len(),
    "Longueur des texte determinant pour les sentiments Negative",
)

## WordCloud
wordClouds = dict([
    ('Nuage de mot pour les tweet negative', negative_text),
    ('Nuage de mot pour les textes determinant pour les sentiments negative', negative_selected_text),
])
Mcreate_wordcloud_row(wordClouds)

# Show up to ten positive tweets whose text is shorter than 10 characters.
# The mask is computed on train_df and aligned on the index by .loc.
_short_tweet_mask = train_df['text'].str.len() < 10
df_positive.loc[_short_tweet_mask].head(10)

# Keep only the part of each tweet that precedes its first "https://" link.
# The pattern is a raw string without the needless '\/' escapes: in a normal
# string literal '\/' is an invalid escape sequence that newer Python versions
# warn about. The regular expression itself is unchanged.
train_df['text'] = train_df['text'].apply(
    lambda x: re.split(r'https://.*', str(x))[0]
)

import re

# Remove, in this order: http... links, www... links, then @user mentions.
for _noise_pattern in (r'http\S+', r'www\S+', r'@\S+'):
    train_df['text'] = train_df['text'].replace(_noise_pattern, '', regex=True)

# Ask spaCy to use the GPU when one is available
# (per the spaCy documentation it otherwise keeps running on the CPU).
spacy.prefer_gpu()

def save_model(output_dir, nlp, new_model_name):
    """Save a spaCy pipeline under ``./output/<output_dir>``.

    Args:
        output_dir (str | None): sub-directory of ``./output`` to write to.
            When None, nothing is saved.
        nlp: the spaCy pipeline to save; its ``meta["name"]`` is overwritten.
        new_model_name (str): name stored in the pipeline metadata.
    """
    # Test for None BEFORE building the path: once formatted into the f-string
    # the value is never None and the model would land in './output/None'.
    if output_dir is None:
        return

    output_dir = f'./output/{output_dir}'
    os.makedirs(output_dir, exist_ok=True)
    nlp.meta["name"] = new_model_name
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

# pass model=<name or path of a spaCy pipeline> to train on top of an existing model
def train(train_data, output_dir, n_iter=40, model=None):
    """Load the model, set up the pipeline and train the entity recognizer.

    Args:
        train_data: iterable of ``(text, {"entities": [[start, end, label], ...]})``
            pairs. It is copied, so the caller's list is not reordered.
        output_dir (str): sub-directory handed to ``save_model`` once training
            is finished.
        n_iter (int): number of passes over the training data.
        model (str | None): name or path of an existing spaCy pipeline to keep
            training. When None, a blank English pipeline is created.
    """
    if model is not None:
        # Load the pipeline the caller asked for (not the output directory).
        nlp = spacy.load(model)
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # add_pipe returns the component that is actually part of the pipeline;
    # labels must be added to that object, not to a detached create_pipe() one.
    if "ner" not in nlp.pipe_names:
        ner = nlp.add_pipe("ner", last=True)
    else:
        ner = nlp.get_pipe("ner")

    # Work on a private copy: the data is shuffled in place at every iteration.
    train_data = list(train_data)

    # add labels
    for _, annotations in train_data:
        for ent in annotations.get("entities", []):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        if model is None:
            nlp.begin_training()
        else:
            nlp.resume_training()

        for itn in tqdm(range(n_iter)):
            random.shuffle(train_data)
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train_data, size=compounding(4.0, 500.0, 1.001))
            losses = {}
            for batch in batches:
                examples = [
                    Example.from_dict(nlp.make_doc(text), annotations)
                    for text, annotations in batch
                ]
                nlp.update(
                    examples,
                    drop=0.1,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)

    save_model(output_dir, nlp, 'st_ner')

def get_model_out_path(sentiment):
    """Return the output path of the model trained for ``sentiment``.

    Args:
        sentiment (string): tweet sentiment, "neutral", "positive" or "negative"

    Returns:
        string: path to the model; any value other than "positive" or
        "negative" maps to the neutral model path.
    """
    if sentiment == 'positive':
        return 'models/model_pos'
    if sentiment == 'negative':
        return 'models/model_neg'
    return 'models/model_neu'

def get_training_data(sentiment: str, df=None) -> list:
    """Build the spaCy training examples for one sentiment.

    Args:
        sentiment (string): tweet sentiment, "neutral", "positive" or "negative"
        df (pandas.DataFrame, optional): rows to convert, with ``text``,
            ``selected_text`` and ``sentiment`` columns. Defaults to the
            module-level ``train_df``.

    Returns:
        list: ``(text, {"entities": [[start, end, 'selected_text']]})`` tuples,
        one per usable row.
    """
    if df is None:
        df = train_df

    train_data = []
    for row in df.itertuples(index=False):
        if row.sentiment != sentiment:
            continue
        text, selected_text = row.text, row.selected_text
        if not isinstance(text, str) or not isinstance(selected_text, str):
            continue
        if not selected_text:
            continue
        start = text.find(selected_text)
        # Once links/mentions are stripped from ``text`` the selected text may
        # no longer be part of it; find() then returns -1 and the entity
        # offsets would be meaningless, so such rows are skipped.
        if start < 0:
            continue
        end = start + len(selected_text)
        train_data.append(
            (text, {"entities": [[start, end, 'selected_text']]}))
    return train_data

# Train and save one NER model per sentiment: positive, negative, then neutral.
for sentiment in ('positive', 'negative', 'neutral'):
    train_data = get_training_data(sentiment)
    model_path = get_model_out_path(sentiment)
    train(train_data, model_path, n_iter=40, model=None)

#TRAINED_MODELS_BASE_PATH = '/output/models/' # Défini le dossier ou stoquer les models TRAINED_MODELS_BASE_PATH ='/kaggle/working/output/models/'

def predict_entities(text, model, log=False):
    """Predict the selected text of a tweet with a spaCy model.

    Args:
        text (str): the text to process
        model (spacy_model): the spaCy model to run on the text
        log (bool, optional): print the retained ``[start, end, label]`` span
            (or None when the model found no entity)

    Returns:
        str: the text of the first entity found by the model, or the whole
        input text when no entity is found
    """
    doc = model(text)

    ent_array = []
    for ent in doc.ents:
        # Use the character offsets reported by the model; searching for the
        # entity text would point at an earlier identical substring.
        span = [ent.start_char, ent.end_char, ent.label_]
        if span not in ent_array:
            ent_array.append(span)

    if log:
        # Safe even when no entity was found.
        print(ent_array[0] if ent_array else None)

    if not ent_array:
        return text
    start, end, _ = ent_array[0]
    return text[start:end]

import wandb
from kaggle_secrets import UserSecretsClient


def jaccard(str1, str2):
    """Return the Jaccard similarity between the word sets of two strings.

    The similarity is the size of the intersection of the two (lower-cased,
    whitespace-split) word sets divided by the size of their union.

    Args:
        str1 (str): text 1
        str2 (str): text 2

    Returns:
        float: ratio between 0.0 and 1.0; 1.0 when both strings contain no
        word at all (two empty sets are identical)
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    c = a.intersection(b)
    union_size = len(a) + len(b) - len(c)
    if union_size == 0:
        # Both strings are empty or whitespace only: avoid dividing by zero.
        return 1.0
    return float(len(c)) / union_size


if TRAINED_MODELS_BASE_PATH is not None:
    print("Loading Models from ", TRAINED_MODELS_BASE_PATH)
    model_pos = spacy.load(TRAINED_MODELS_BASE_PATH + 'model_pos')
    model_neg = spacy.load(TRAINED_MODELS_BASE_PATH + 'model_neg')
    model_neu = spacy.load(TRAINED_MODELS_BASE_PATH + 'model_neu')

    # Per-model score history, running total and per-row records.
    jaccard_history = {
        'neg': [],
        'neu': [],
        'pos': [],
    }
    jaccard_score = 0
    jaccard_rows = []

    # Optional Weights & Biases tracking (disabled):
    #wandb.login(key=UserSecretsClient().get_secret("API_KEY_WANDB"))
    #wandb.init(project="spacy-tweet")
    #wandb.run.name="30 iter"

    for index, row in tqdm(train_df.iterrows(), total=train_df.shape[0]):
        # Pick the model matching the row's sentiment; anything that is
        # neither neutral nor positive is scored with the negative model.
        if row.sentiment == 'neutral':
            history_key, model = 'neu', model_neu
        elif row.sentiment == 'positive':
            history_key, model = 'pos', model_pos
        else:
            history_key, model = 'neg', model_neg

        jaccard_ = jaccard(predict_entities(row.text, model), row.selected_text)
        #wandb.log({"jaccard_score": jaccard_, f"jaccard_score_model_{history_key}": jaccard_})
        jaccard_history[history_key].append(jaccard_)
        jaccard_score += jaccard_
        jaccard_rows.append(
            {'idRow': row.textID, 'sentiment': row.sentiment, 'jaccard': jaccard_})

    # Build the frame once: DataFrame.append no longer exists in pandas 2.x
    # and copied the whole frame on every call.
    jaccard_df = pd.DataFrame(jaccard_rows, columns=['idRow', 'sentiment', 'jaccard'])

    print(f'Average Jaccard Score is {jaccard_score / train_df.shape[0]}')

# Distribution of the per-tweet Jaccard scores, one violin per sentiment.
fig = sb.violinplot(data=jaccard_df, x="jaccard", y="sentiment")

# Save the figure in a folder named after the current run, creating the
# folder first when it does not exist yet.
folder = f'/kaggle/working/img/{model_name}'
if os.path.exists(f'{folder}'):
    pass
else:
    os.makedirs(folder)
plt.savefig(f'{folder}/violin.png')

# Bar plot of the Jaccard score per sentiment, plus an "all" bar holding the
# overall average. pd.concat replaces DataFrame.append, which no longer exists
# in pandas 2.x.
_overall_row = pd.DataFrame(
    [{'sentiment': "all", 'jaccard': jaccard_score / train_df.shape[0]}]
)
y = sb.barplot(
    data=pd.concat([jaccard_df, _overall_row], ignore_index=True),
    x="sentiment",
    y="jaccard",
)
y.set(xlabel='performance')
y.bar_label(y.containers[0])
y.set(title='Performance en fonction des modèles')

# Save the figure next to the other plots of this run.
os.makedirs(folder, exist_ok=True)
plt.savefig(f'{folder}/barplot')

# Reload the negative-sentiment model, print the labelled selected text of the
# training row with index label 4, then run the model on that row's text with
# logging enabled.
model_neg = spacy.load(TRAINED_MODELS_BASE_PATH + 'model_neg')
print(train_df.selected_text[4])
predict_entities(
    train_df.text[4],
    model_neg,
    log=True,
)

if TRAINED_MODELS_BASE_PATH is not None:
    print("Loading Models from ", TRAINED_MODELS_BASE_PATH)
    model_pos = spacy.load(TRAINED_MODELS_BASE_PATH + 'model_pos')
    model_neg = spacy.load(TRAINED_MODELS_BASE_PATH + 'model_neg')
    model_neu = spacy.load(TRAINED_MODELS_BASE_PATH + 'model_neu')

    columns = ['textID', 'selected_text']
    submission_rows = []
    for index, row in tqdm(test_df.iterrows(), total=test_df.shape[0]):
        # Pick the model matching the row's sentiment; anything that is
        # neither neutral nor positive goes to the negative model.
        if row.sentiment == 'neutral':
            model = model_neu
        elif row.sentiment == 'positive':
            model = model_pos
        else:
            model = model_neg
        selected_text = predict_entities(row.text, model)
        submission_rows.append(
            {'textID': row.textID, 'selected_text': selected_text})

    # Build the frame once: DataFrame.append no longer exists in pandas 2.x
    # and copied the whole frame on every call.
    submission_df = pd.DataFrame(submission_rows, columns=columns)
    submission_df.to_csv('submission.csv', index=False)