# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from keras.layers import Dense, Input, LSTM, Bidirectional, Conv1D
from keras.layers import Embedding, GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
from keras.models import Model
from subprocess import check_output
print(check_output(["ls", "./input"]).decode("utf8"))
train_df = pd.read_csv("./input/train.csv")
test_df = pd.read_csv("./input/test.csv")
model_v = "lstm3-32"  # version tag used in the wandb run name and checkpoint filename
train_df.describe()
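# Inspect class balance: what fraction of comments carries each toxicity label?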
import seaborn as sb
rep = train_df.loc[:, train_df.columns != 'comment_text']  # keep only the label columns
taille = len(rep)  # total number of training comments
di = {
    'toxic': rep.loc[rep['toxic'] == 1].toxic.count() / taille,
    'severe_toxic': rep.loc[rep['severe_toxic'] == 1].severe_toxic.count() / taille,
    'obscene': rep.loc[rep['obscene'] == 1].obscene.count() / taille,
    'threat': rep.loc[rep['threat'] == 1].threat.count() / taille,
    'insult': rep.loc[rep['insult'] == 1].insult.count() / taille,
    'identity_hate': rep.loc[rep['identity_hate'] == 1].identity_hate.count() / taille
}
keys = list(di.keys())
values = list(di.values())
di
sb.barplot(y=values, x=keys).set(title='Percentage breakdown of abuse labels')
plt.tight_layout()
def barplot_boxplot_row(df, titl):
    # set a grey background (use sb.set_theme() if seaborn version 0.11.0 or above)
    sb.set(style="darkgrid")
    # create a figure with two stacked Axes: a slim boxplot above a histogram
    f, (ax_box, ax_hist) = plt.subplots(2, 1, sharex=True, figsize=(20, 5),
                                        gridspec_kw={"height_ratios": (.15, .85)})
    # assign a plot to each Axes; unpacking already yields individual Axes objects
    sb.boxplot(x=df, ax=ax_box)
    sb.histplot(data=df, ax=ax_hist)
    ax_box.set(title=titl)
    plt.show()
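# Example usage (hypothetical input): distribution of raw comment lengths.
# barplot_boxplot_row(train_df['comment_text'].str.len(), 'Comment length')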
test_df.sample(5)
# Replace missing comments with a placeholder string.
train_df['comment_text'] = train_df['comment_text'].fillna("unknown")
test_df['comment_text'] = test_df['comment_text'].fillna("unknown")
train_text_data = train_df['comment_text']
test_text_data = test_df['comment_text']
train_y = train_df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values
con_data = pd.concat([train_text_data,test_text_data])
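# Fit the tokenizer on train and test text together so words that appear
# only in the test set still get an index (and, where available, an embedding).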
max_features = 100000  # cap on vocabulary size
maxlen = 150           # comments are padded/truncated to this many tokens
embed_size = 300       # dimensionality of the GloVe vectors loaded below
tokenizer = text.Tokenizer(num_words=max_features, lower=True)
tokenizer.fit_on_texts(list(con_data))
train_x = tokenizer.texts_to_sequences(train_text_data)
test_x = tokenizer.texts_to_sequences(test_text_data)
train_x_pad = sequence.pad_sequences(train_x, maxlen=maxlen)
test_x_pad = sequence.pad_sequences(test_x, maxlen=maxlen)
len(train_x_pad)
len(train_x_pad[2])
word_index = tokenizer.word_index
len(word_index)
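# Parse the pre-trained GloVe file into a word -> vector dictionary;
# each line holds a token followed by its 300 float components.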
EMBEDDING_FILE = './input/glove.840B.300d.txt'
embeddings_index = {}
with open(EMBEDDING_FILE, encoding='utf8') as f:
    for line in f:
        values = line.rstrip().rsplit(' ')
        # a few GloVe-840B tokens contain spaces, so take the last embed_size
        # fields as the vector and rejoin the rest as the word
        word = ' '.join(values[:-embed_size])
        coefs = np.asarray(values[-embed_size:], dtype='float32')
        embeddings_index[word] = coefs
num_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embed_size))
for word, i in word_index.items():
if i >= max_features:
continue
embedding_vector = embeddings_index.get(word)
if embedding_vector is not None:
embedding_matrix[i] = embedding_vector
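# Illustrative sanity check (oov_count is a helper name introduced here):
# how many rows of embedding_matrix stayed zero because no GloVe vector exists.
oov_count = sum(1 for word, i in word_index.items()
                if i < num_words and embeddings_index.get(word) is None)
print('Embedding rows left zero-initialised (out of vocabulary):', oov_count)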
import wandb
wandb.init(project="toxic classification", name=f"model_{model_v}")
class RocAucEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(RocAucEvaluation, self).__init__()  # super(Callback, self) would skip Callback.__init__
        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            wandb.log({'val_roc_auc': score})  # log the scalar score rather than the full prediction array
            print("\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(epoch + 1, score))
inp = Input(shape=(maxlen,))
x = Embedding(num_words, embed_size, weights=[embedding_matrix], trainable=False)(inp)  # num_words matches the rows of embedding_matrix
x = SpatialDropout1D(0.10)(x)
x = Bidirectional(LSTM(32, return_sequences=True, dropout=0.05 , recurrent_dropout=0.10))(x)
x = Bidirectional(LSTM(32, return_sequences=True, dropout=0.05, recurrent_dropout=0.10))(x)
x = Bidirectional(LSTM(32, return_sequences=True, dropout=0.05, recurrent_dropout=0.10))(x)
x = Conv1D(64, kernel_size=3, padding='valid', kernel_initializer='glorot_uniform')(x)
x = Conv1D(64, kernel_size=3, padding='valid', kernel_initializer='glorot_uniform')(x)
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
x = concatenate([avg_pool, max_pool])
out = Dense(6, activation='sigmoid')(x)
model = Model(inp, out)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
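# Optional: print layer shapes and parameter counts to verify the wiring.
model.summary()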
batch_size = 128
epochs = 4
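# Hold out 10% of the training data for validation (used by the checkpoint and ROC-AUC callback).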
X_tra, X_val, y_tra, y_val = train_test_split(train_x_pad, train_y, train_size=0.9, random_state=233)
from wandb.keras import WandbCallback
# filepath="../input/best-model/best.hdf5"
import os
os.makedirs("./models", exist_ok=True)  # ensure the checkpoint directory exists
filepath = f"./models/weights_{model_v}.best.hdf5"
# note: older Keras versions log accuracy as 'val_acc' rather than 'val_accuracy'
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
#early = EarlyStopping(monitor="val_accuracy", mode="max", patience=5)
ra_val = RocAucEvaluation(validation_data=(X_val, y_val), interval = 1)
callbacks_list = [ra_val,checkpoint,WandbCallback()]
model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),callbacks = callbacks_list,verbose=1)
# Load the best weights saved by the checkpoint, then predict on the test set.
model.load_weights(filepath)
print('Predicting....')
predictions = model.predict(test_x_pad, batch_size=1024, verbose=1)
# Alternative: retrain on the full training set without a validation split.
#model.fit(train_x_pad, train_y, batch_size=batch_size, epochs=epochs, verbose=1)
#predictions = model.predict(test_x_pad, batch_size=batch_size, verbose=1)
submission = pd.read_csv('./input/sample_submission.csv')
submission[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']] = predictions
submission.to_csv('submission.csv', index=False)
submission.head()
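# Illustrative sanity check: sigmoid outputs should already lie in [0, 1].
assert ((predictions >= 0) & (predictions <= 1)).all()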