import numpy as np
import pandas as pd
import seaborn as sb
import os
# Print the path of every file found below the local 'input' directory.
for root_dir, _, file_names in os.walk('input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))
import re
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from tqdm import tqdm
import os
import nltk
import spacy
import random
from spacy.util import compounding
from spacy.util import minibatch
from spacy.training.example import Example
import warnings
# Silence every warning for the rest of the run and pick the plot theme.
warnings.filterwarnings("ignore")
sb.set(style="darkgrid")

# Folder holding the competition CSV files, and the label used to name the
# folder where figures are saved.
BASE_PATH = '/kaggle/input/tweet-sentiment-extraction/'
model_name = "10iter"

# Load the train, test and sample-submission tables, in that order.
train_df, test_df, submission_df = (
    pd.read_csv(f'{BASE_PATH}{csv_name}')
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)
def Mcreate_wordcloud_row(texts):
    """Draw one word cloud per entry of *texts*, side by side on one row.

    Args:
        texts: Mapping of subplot title -> corpus. A corpus is either a
            single string or an iterable of texts (for example a pandas
            Series of tweets).
    """
    if not texts:
        # plt.subplots() rejects a grid with zero columns.
        return

    # squeeze=False keeps `axs` two-dimensional even when there is a single
    # entry, so the axes can always be iterated.
    _fig, axs = plt.subplots(
        nrows=1,
        ncols=len(texts),
        figsize=(len(texts) * 10, len(texts) * 10),
        squeeze=False,
    )

    # Generate and draw a word cloud for each text
    for ax, (title, corpus) in zip(axs[0], texts.items()):
        if isinstance(corpus, str):
            words = corpus
        elif hasattr(corpus, '__iter__'):
            # Join the individual texts: str() on a pandas Series only gives
            # its truncated repr (index values, "dtype", ...), not the tweets.
            words = " ".join(str(item) for item in corpus)
        else:
            words = str(corpus)

        wordcloud = WordCloud(
            background_color='white',
            max_words=200,
            max_font_size=40,
            scale=3,
            random_state=42,
        ).generate(words)
        ax.set(title=str(title))
        ax.imshow(wordcloud, interpolation='bilinear')
        ax.axis("off")
def barplot_boxplot_row(df, title, df1, title1):
    """Plot two distributions side by side, each as a box plot above a histogram.

    Args:
        df: Values shown in the left column.
        title: Title of the left column.
        df1: Values shown in the right column.
        title1: Title of the right column.
    """
    # set a grey background (use sb.set_theme() if seaborn version 0.11.0 or above)
    sb.set(style="darkgrid")
    # 2x2 grid sharing the x axis: a thin top row for the box plots and a
    # tall bottom row for the histograms.
    f, (ax_box, ax_hist) = plt.subplots(
        2, 2, sharex=True, figsize=(20, 5),
        gridspec_kw={"height_ratios": (.15, .85)},
    )
    for column, (values, heading) in enumerate(((df, title), (df1, title1))):
        # Pass the values explicitly as `x`: the first positional parameter of
        # sb.boxplot changed from `x` to `data` between seaborn releases, so a
        # positional call does not give the same orientation everywhere.
        sb.boxplot(x=values, ax=ax_box[column])
        sb.histplot(data=values, ax=ax_hist[column])
        # Remove x axis name for the boxplot
        ax_box[column].set(xlabel='', title=heading)
    plt.show()
# Drop every row that contains a missing value.
train_df = train_df.dropna()

# Number of tweets per sentiment label.
plt.figure(figsize=(15, 5))
sb.histplot(train_df, x='sentiment', hue="sentiment").set(title='Répartition des sentiments dans le dataset ')

## Barplot + histplot
neutral_df = train_df.loc[train_df['sentiment'] == "neutral"]
neutral_text = neutral_df.text
neutral_selected_text = neutral_df.selected_text
# The titles name the neutral class (they used to say "Positive" for both
# columns, a copy-paste slip from the positive section).
barplot_boxplot_row(
    neutral_text.str.len(),
    "Longueur des tweet pour les sentiments Neutral",
    neutral_selected_text.str.len(),
    "Longueur des texte determinant pour les sentiments Neutral",
)
## WordCloud
wordClouds = {
    'Nuage de mot pour les tweet neutre': neutral_text,
    'Nuage de mot pour les mot determinant pour les sentiments neutre': neutral_selected_text,
}
Mcreate_wordcloud_row(wordClouds)
# Bare expression: displays the frame when run as a notebook cell.
neutral_df
## Box plot + histogram of the text lengths for positive tweets
df_positive = train_df[train_df['sentiment'] == "positive"]
positive_text = df_positive['text']
positive_selected_text = df_positive['selected_text']
barplot_boxplot_row(
    df=positive_text.str.len(),
    title="Longueur des tweet pour les sentiments Positive",
    df1=positive_selected_text.str.len(),
    title1="Longueur des texte determinant pour les sentiments Postive ",
)
## Word clouds of the full tweets and of their selected spans
wordClouds = {
    'Nuage de mot pour les tweet positive': positive_text,
    'Nuage de mot pour les mot determinant pour les sentiments Positive': positive_selected_text,
}
Mcreate_wordcloud_row(wordClouds)
# Full tweets and selected spans of the negative class.
negative_text = train_df.loc[train_df['sentiment'] == "negative", 'text']
negative_selected_text = train_df.loc[train_df['sentiment'] == "negative", 'selected_text']
barplot_boxplot_row(
    df=negative_text.str.len(),
    title="Longueur des tweet pour les sentiments Negative",
    df1=negative_selected_text.str.len(),
    title1="Longueur des texte determinant pour les sentiments Negative",
)
## Word clouds of the full tweets and of their selected spans
wordClouds = dict([
    ('Nuage de mot pour les tweet negative', negative_text),
    ('Nuage de mot pour les textes determinant pour les sentiments negative', negative_selected_text),
])
Mcreate_wordcloud_row(wordClouds)
# Peek at the first ten positive tweets shorter than 10 characters. The mask
# is built from the same frame that is being indexed, so no cross-frame
# index alignment is involved.
df_positive.loc[df_positive['text'].str.len() < 10][:10]
# Keep only what precedes the first https:// link of each tweet. The pattern
# is a raw string: '\/' is an invalid escape sequence in a normal literal.
train_df['text'] = train_df['text'].apply(lambda x: re.split(r'https://.*', str(x))[0])
import re
# Remove remaining links from the text
train_df['text'] = train_df['text'].replace(r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True)
# Remove @user mentions from the text
train_df['text'] = train_df['text'].replace(r'@\S+', '', regex=True)
spacy.prefer_gpu()
def save_model(output_dir, nlp, new_model_name):
    """Write *nlp* to ``./output/<output_dir>`` under the name *new_model_name*.

    Args:
        output_dir: Directory, relative to ``./output``, that receives the
            model. Nothing is saved when it is None.
        nlp: Pipeline object exposing a ``meta`` mapping and ``to_disk()``.
        new_model_name: Value stored in ``nlp.meta["name"]`` before saving.
    """
    # Test for None before building the path: the formatted string is never
    # None, so checking it afterwards would save to './output/None'.
    if output_dir is None:
        return
    output_dir = f'./output/{output_dir}'
    # exist_ok avoids the separate existence check (and its race).
    os.makedirs(output_dir, exist_ok=True)
    nlp.meta["name"] = new_model_name
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)
def train(train_data, output_dir, n_iter=40, model=None):
    """Train a spaCy entity recognizer on *train_data* and save the pipeline.

    Args:
        train_data: Sequence of ``(text, {"entities": [[start, end, label]]})``
            pairs.
        output_dir: Passed to ``save_model`` once training is done; it is
            also the path the existing pipeline is loaded from when *model*
            is not None.
        n_iter: Number of passes over the training data.
        model: None to start from a blank English pipeline. Any other value
            resumes training of the pipeline loaded from *output_dir*.
    """
    if model is not None:
        nlp = spacy.load(output_dir)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # add_pipe() builds the component and returns the instance that lives in
    # the pipeline. Labels must be registered on that instance; a component
    # made separately with create_pipe() is not part of the pipeline.
    if "ner" not in nlp.pipe_names:
        ner = nlp.add_pipe("ner", last=True)
    else:
        ner = nlp.get_pipe("ner")

    # Work on a copy so that shuffling does not reorder the caller's list.
    train_data = list(train_data)

    # add labels
    for _, annotations in train_data:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # Only the NER component is trained; every other pipe is disabled.
    with nlp.select_pipes(enable=["ner"]):
        if model is None:
            nlp.initialize()
        else:
            nlp.resume_training()

        for _ in tqdm(range(n_iter)):
            random.shuffle(train_data)
            # Batch sizes grow from 4 towards 500 within each pass.
            batches = minibatch(train_data, size=compounding(4.0, 500.0, 1.001))
            losses = {}
            for batch in batches:
                examples = [
                    Example.from_dict(nlp.make_doc(text), annotations)
                    for text, annotations in batch
                ]
                nlp.update(
                    examples,
                    drop=0.1,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)
    save_model(output_dir, nlp, 'st_ner')
def get_model_out_path(sentiment):
    """Return the output path of the model dedicated to a sentiment.

    Args:
        sentiment (string): tweet sentiment, "neutral", "positive" or "negative"
    Returns:
        string: path to model; every value other than "positive" and
        "negative" maps to the neutral model path
    """
    known_paths = (
        ('positive', 'models/model_pos'),
        ('negative', 'models/model_neg'),
    )
    matching = (path for label, path in known_paths if sentiment == label)
    return next(matching, 'models/model_neu')
def get_training_data(sentiment: str) -> list:
    """Build spaCy-format training examples from the rows of ``train_df``.

    Only rows whose ``sentiment`` column equals *sentiment* are used.

    Args:
        sentiment (string): tweet sentiment, "neutral", "positive" or "negative"
    Returns:
        list: ``(text, {"entities": [[start, end, 'selected_text']]})`` tuples
    """
    train_data = []
    subset = train_df[train_df['sentiment'] == sentiment]
    for text, selected_text in zip(subset['text'], subset['selected_text']):
        start = text.find(selected_text)
        # str.find returns -1 when the selected text is not a substring of the
        # tweet; that would give a bogus [-1, len - 1] span, so skip the row.
        # An empty selection cannot form an entity either.
        if start == -1 or not selected_text:
            continue
        end = start + len(selected_text)
        train_data.append(
            (text, {"entities": [[start, end, 'selected_text']]}))
    return train_data
# Train one dedicated model per sentiment: positive, then negative, then neutral.
for sentiment in ('positive', 'negative', 'neutral'):
    train_data = get_training_data(sentiment)
    model_path = get_model_out_path(sentiment)
    train(train_data, model_path, n_iter=40, model=None)

# Folder the trained models are loaded from.
#TRAINED_MODELS_BASE_PATH = '/output/models/'
TRAINED_MODELS_BASE_PATH = '/kaggle/working/output/models/'
def predict_entities(text, model, log=False):
    """Return the first entity the spaCy model predicts in a text.

    Args:
        text (str): the text to process
        model (spacy_model): the spaCy model applied to the text
        log (bool, optional): print the detected spans when True
    Returns:
        str: the text of the first predicted entity, or the whole *text*
        when the model predicts no entity
    """
    doc = model(text)
    ent_array = []
    for ent in doc.ents:
        # Use the character offsets reported by spaCy; text.find() would
        # always point at the first occurrence of the entity text.
        span = [ent.start_char, ent.end_char, ent.label_]
        if span not in ent_array:
            ent_array.append(span)
    if log:
        # The list is always defined, even when no entity was found.
        print(ent_array)
    if not ent_array:
        return text
    start, end, _ = ent_array[0]
    return text[start:end]
import wandb
from kaggle_secrets import UserSecretsClient
def jaccard(str1, str2):
    """Return the Jaccard similarity of the word sets of two strings.

    Each string is lower-cased and split on whitespace; the score is the
    size of the intersection of the two word sets divided by the size of
    their union.

    Args:
        str1 (str): first text
        str2 (str): second text
    Returns:
        float: similarity between 0.0 and 1.0; 1.0 when neither string
        contains a word (two empty sets are identical)
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union_size = len(a | b)
    if union_size == 0:
        # Avoid dividing by zero when both strings are empty or blank.
        return 1.0
    return len(a & b) / union_size
# Load the positive, negative and neutral pipelines when a model folder is set.
if TRAINED_MODELS_BASE_PATH is not None:
    print("Loading Models from ", TRAINED_MODELS_BASE_PATH)
    model_pos, model_neg, model_neu = (
        spacy.load(f'{TRAINED_MODELS_BASE_PATH}{model_dir}')
        for model_dir in ('model_pos', 'model_neg', 'model_neu')
    )
# Per-sentiment list of scores, and the running total over all tweets.
jaccard_history = {
    'neg': [],
    'neu': [],
    'pos': [],
}
jaccard_score = 0
# Optional experiment tracking (disabled):
#wandb.login(key=UserSecretsClient().get_secret("API_KEY_WANDB"))
#wandb.init(project="spacy-tweet")
#wandb.run.name="30 iter"
# Rows are collected in a list and turned into a frame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame every call.
score_rows = []
for index, row in tqdm(train_df.iterrows(), total=train_df.shape[0]):
    text = row.text
    # Pick the model and the history bucket matching the tweet's sentiment;
    # anything that is neither neutral nor positive uses the negative model.
    if row.sentiment == 'neutral':
        history_key, sentiment_model = 'neu', model_neu
    elif row.sentiment == 'positive':
        history_key, sentiment_model = 'pos', model_pos
    else:
        history_key, sentiment_model = 'neg', model_neg
    jaccard_ = jaccard(predict_entities(text, sentiment_model), row.selected_text)
    #wandb.log({"jaccard_score": jaccard_, f"jaccard_score_model_{history_key}": jaccard_})
    jaccard_history[history_key].append(jaccard_)
    jaccard_score += jaccard_
    score_rows.append({'idRow': row.textID, 'sentiment': row.sentiment, 'jaccard': jaccard_})
jaccard_df = pd.DataFrame(score_rows, columns=['idRow', 'sentiment', 'jaccard'])
print(f'Average Jaccard Score is {jaccard_score / train_df.shape[0]}')
# Folder receiving the figures of this run.
folder = f'/kaggle/working/img/{model_name}'
os.makedirs(folder, exist_ok=True)

# Distribution of the per-tweet Jaccard scores for each sentiment. A fresh
# figure is opened so the plot never lands on previously used axes.
plt.figure()
fig = sb.violinplot(data=jaccard_df, x="jaccard", y="sentiment")
plt.savefig(f'{folder}/violin.png')

# Bar per sentiment plus an extra "all" bar holding the global average.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
overall_row = pd.DataFrame(
    [{'sentiment': "all", 'jaccard': jaccard_score / train_df.shape[0]}]
)
# New figure: otherwise the bars are drawn on top of the violin plot when the
# file runs as a plain script.
plt.figure()
y = sb.barplot(
    data=pd.concat([jaccard_df, overall_row], ignore_index=True),
    x="sentiment", y="jaccard")
y.set(xlabel='performance')
y.bar_label(y.containers[0])
y.set(title='Performance en fonction des modèles')
plt.savefig(f'{folder}/barplot')
# Spot check: print the reference span of the row labelled 4, then run the
# negative model on that row's text with logging enabled.
model_neg = spacy.load(f'{TRAINED_MODELS_BASE_PATH}model_neg')
print(train_df['selected_text'][4])
predict_entities(text=train_df['text'][4], model=model_neg, log=True)
# Reload the three trained pipelines before predicting on the test set.
if TRAINED_MODELS_BASE_PATH is not None:
    print("Loading Models from ", TRAINED_MODELS_BASE_PATH)
    model_pos = spacy.load(TRAINED_MODELS_BASE_PATH + 'model_pos')
    model_neg = spacy.load(TRAINED_MODELS_BASE_PATH + 'model_neg')
    model_neu = spacy.load(TRAINED_MODELS_BASE_PATH + 'model_neu')

columns = ['textID', 'selected_text']
# Predictions are collected in a list and turned into a frame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame every call.
submission_rows = []
for index, row in tqdm(test_df.iterrows(), total=test_df.shape[0]):
    text = row.text
    # Anything that is neither neutral nor positive uses the negative model.
    if row.sentiment == 'neutral':
        selected_text = predict_entities(text, model_neu)
    elif row.sentiment == 'positive':
        selected_text = predict_entities(text, model_pos)
    else:
        selected_text = predict_entities(text, model_neg)
    submission_rows.append({'textID': row.textID, 'selected_text': selected_text})
submission_df = pd.DataFrame(submission_rows, columns=columns)
submission_df.to_csv('submission.csv', index=False)