import gensim
import spacy
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pprint import pprint
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV

bdsk = pd.read_csv('/work/data_merged/bdsk.csv')
mlss = pd.read_csv('/work/data_merged/mlss.csv')
ktss = pd.read_csv('/work/data_merged/ktss.csv')
print(
    "Shapes of the dataframes before processing\n"
    "Big Data Social Analysis Knowledge:", bdsk.shape,
    "Machine Learning Social Sciences:", mlss.shape,
    "Knowledge Transfer Social Sciences:", ktss.shape, sep='\n')
Shapes of the dataframes before processing
Big Data Social Analysis Knowledge:
(2000, 67)
Machine Learning Social Sciences:
(2767, 67)
Knowledge Transfer Social Sciences:
(2380, 67)
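# Visualize missingness: a heatmap of df.isna() per corpus (one panel per research term)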
fig, (ax1,ax2,ax3) = plt.subplots(1,3, figsize=(18,5))
sns.heatmap(bdsk.isna(), cbar=False,ax=ax3).set(xticklabels=[])
ax3.set_title("Big Data Social Analysis Knowledge")
sns.heatmap(ktss.isna(), cbar=False,ax=ax2).set(xticklabels=[])
ax2.set_title("Knowledge Transfer Social Sciences")
sns.heatmap(mlss.isna(), cbar=False,ax=ax1).set(xticklabels=[])
ax1.set_title("Machine Learning Social Sciences")
plt.show()
print('Columns with the fewest null values (percentage):', round(bdsk.isnull().mean().nsmallest(8)*100,2),'\n',sep='\n')
Columns with the fewest null values (percentage):
Publication Type 0.00
Authors 0.00
Author Full Names 0.00
Article Title 0.00
Source Title 0.00
UT (Unique WOS ID) 0.00
Abstract 0.25
Publication Year 2.25
dtype: float64
cleaning_list = list(bdsk.isnull().mean().nsmallest(8).index)  # List of the columns we want to keep
# Filter the dataframes with this list; .copy() avoids SettingWithCopyWarning when columns are added later
bdsk_clean = bdsk[cleaning_list].copy()
mlss_clean = mlss[cleaning_list].copy()
ktss_clean = ktss[cleaning_list].copy()
# Export the dataframes to CSV
bdsk_clean.to_csv('/work/clean_data/bdsk_clean.csv',index=False)
mlss_clean.to_csv('/work/clean_data/mlss_clean.csv',index=False)
ktss_clean.to_csv('/work/clean_data/ktss_clean.csv',index=False)
# Check
bdsk_clean.head()
## ARTICLE DISTRIBUTION BY RESEARCH TOPIC
bdsk_clean['Research Term'] = "Big Data Social Analysis Knowledge"
ktss_clean['Research Term'] = "Knowledge Transfer Social Sciences"
mlss_clean['Research Term'] = "Machine Learning Social Sciences"
# CONCATENATE THE DATAFRAMES
dfs = [bdsk_clean,ktss_clean,mlss_clean]
df = pd.concat(dfs)
# GROUP BY RESEARCH TERM AND DROP DUPLICATES
# a -> b collapses within-search duplicates back to a single term; c -> d abbreviates cross-search combinations
a = ['Knowledge Transfer Social Sciences, Knowledge Transfer Social Sciences',
     'Machine Learning Social Sciences, Machine Learning Social Sciences',
     'Big Data Social Analysis Knowledge, Big Data Social Analysis Knowledge']
b = ['Knowledge Transfer Social Sciences', 'Machine Learning Social Sciences', 'Big Data Social Analysis Knowledge']
c = ['Big Data Social Analysis Knowledge, Machine Learning Social Sciences',
     'Knowledge Transfer Social Sciences, Machine Learning Social Sciences',
     'Big Data Social Analysis Knowledge, Knowledge Transfer Social Sciences',
     'Big Data Social Analysis Knowledge, Knowledge Transfer Social Sciences, Machine Learning Social Sciences']
d = ['BDSK, MLSS', 'KTSS, MLSS', 'BDSK, KTSS', 'BDSK, KTSS, MLSS']
df_clean = df.groupby(cleaning_list)['Research Term'].apply(', '.join).reset_index()
df_clean['Research Term'] = df_clean['Research Term'].replace(a,b).replace(c,d)
df_clean.shape
df_clean['Research Term'].value_counts()
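# Sanity check (sketch, not in the original run): the row-count difference between the concatenated
# and the grouped dataframes is the number of duplicate rows collapsed across the three searches.
print("Rows before grouping:", len(df), "- after grouping:", len(df_clean),
      "- duplicate rows collapsed:", len(df) - len(df_clean))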
e, f = ['J', 'S', 'B'], ['Journal', 'Series', 'Book']
ax = bdsk_clean['Publication Type'].replace(e,f).value_counts().plot.bar(figsize=(5,5),position=-0.5,width=0.1)
mlss_clean['Publication Type'].replace(e,f).value_counts().plot.bar(position=0.5,width=0.1,color='green')
ktss_clean['Publication Type'].replace(e,f).value_counts().plot.bar(position=1.5,width=0.1,color='orange')
# Legend entries follow the plotting order: bdsk (blue), mlss (green), ktss (orange)
ax.legend(["Big Data Social Analysis Knowledge","Machine Learning Social Sciences","Knowledge Transfer Social Sciences"])
plt.subplots(figsize=(15,5))
bdsk_clean[bdsk_clean['Publication Year'] != 2021]['Publication Year'].value_counts().sort_index(ascending=True).plot()
ktss_clean[ktss_clean['Publication Year'] != 2021]['Publication Year'].value_counts().sort_index(ascending=True).plot()
mlss_clean[mlss_clean['Publication Year'] != 2021]['Publication Year'].value_counts().sort_index(ascending=True).plot()
plt.legend(["Big Data Social Analysis Knowledge","Knowledge Transfer Social Sciences","Machine Learning Social Sciences"])
plt.title("Production par année, par sujet de recherche")
df_clean[~df_clean['Research Term'].isin(b)]['Research Term'].value_counts().plot(kind='bar',color=['C0', 'C1', 'C2', 'C3'],
    figsize=(10,5),xlabel='',title="Number of articles appearing in more than one search")
## NLP
# Select the abstracts and pre-process them with gensim
phrases = df_clean.Abstract
def sent_to_words(sentences):
    for sentence in sentences:
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)
phrases_traitees = list(sent_to_words(phrases))
print(phrases_traitees[:2])
[['phishing', 'is', 'one', 'of', 'the', 'most', 'common', 'attacks', 'on', 'the', 'internet', 'that', 'employs', 'social', 'engineering', 'techniques', 'like', 'deceiving', 'user', 'with', 'forged', 'websites', 'in', 'an', 'attempt', 'to', 'gain', 'sensitive', 'information', 'such', 'as', 'credentials', 'and', 'credit', 'card', 'details', 'this', 'information', 'can', 'be', 'misused', 'resulting', 'in', 'large', 'financial', 'losses', 'to', 'these', 'users', 'phishing', 'detection', 'algorithms', 'can', 'be', 'an', 'effective', 'approach', 'to', 'safeguarding', 'users', 'from', 'such', 'attacks', 'this', 'paper', 'will', 'review', 'different', 'phishing', 'detection', 'approaches', 'which', 'include', 'content', 'based', 'heuristic', 'based', 'and', 'fuzzy', 'rule', 'based', 'approaches'], ['online', 'micro', 'blogging', 'social', 'media', 'websites', 'like', 'twitter', 'are', 'being', 'used', 'as', 'real', 'time', 'platform', 'for', 'information', 'sharing', 'and', 'communication', 'during', 'planning', 'and', 'mobilization', 'of', 'civil', 'unrest', 'events', 'we', 'conduct', 'study', 'of', 'more', 'than', 'million', 'english', 'tweets', 'spanning', 'months', 'on', 'the', 'topic', 'of', 'immigration', 'and', 'found', 'evidences', 'of', 'twitter', 'being', 'used', 'as', 'platform', 'for', 'planning', 'and', 'mobilization', 'of', 'protests', 'and', 'civil', 'disobedience', 'related', 'demonstrations', 'we', 'believe', 'that', 'twitter', 'data', 'can', 'be', 'used', 'as', 'surrogate', 'and', 'open', 'source', 'precursor', 'for', 'forecasting', 'civil', 'unrest', 'and', 'investigate', 'machine', 'learning', 'based', 'techniques', 'for', 'building', 'prediction', 'model', 'we', 'present', 'our', 'solution', 'approach', 'consisting', 'of', 'various', 'components', 'such', 'as', 'named', 'entity', 'recognition', 'temporal', 'spatial', 'location', 'people', 'expressions', 'extraction', 'semantic', 'enrichment', 'of', 'events', 'related', 'tweets', 'crowd', 'buzz', 'commentary', 'and', 'mobilization', 'planning', 'location', 'time', 'topic', 'correlation', 'miner', 'we', 'conduct', 'series', 'of', 'experiments', 'on', 'real', 'world', 'and', 'large', 'dataset', 'and', 'investigate', 'the', 'application', 'of', 'trend', 'analysis', 'we', 'conduct', 'two', 'case', 'studies', 'on', 'civil', 'unrest', 'related', 'events', 'and', 'demonstrate', 'the', 'effectiveness', 'of', 'our', 'approach']]
# Lemmatization: keep only the allowed parts of speech and drop pronoun lemmas ('-PRON-')
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append(" ".join([token.lemma_ if token.lemma_ not in ['-PRON-'] else '' for token in doc if token.pos_ in allowed_postags]))
    return texts_out
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
phrases_lemmatized = lemmatization(phrases_traitees, allowed_postags=['NOUN', 'VERB'])
print(phrases_lemmatized[:2])
['phishing be attack internet employ engineering technique deceive user forge website attempt gain information credential credit card detail information be misuse result loss user phishe detection algorithm be approach safeguard user attack paper review phishe detection approach include content base base rule base approach', 'micro blogge medium website twitter be be use time platform information sharing communication planning mobilization unrest event conduct study tweet span month topic immigration find evidence twitter be use platform planning mobilization protest disobedience relate demonstration believe twitter datum be use source precursor forecast unrest investigate machine learn base technique build prediction model present solution approach consist component name entity recognition location people expression extraction enrichment event relate tweet crowd commentary mobilization planning location time topic correlation miner conduct series experiment world dataset investigate application trend analysis conduct case study unrest event demonstrate effectiveness approach']
# Vectorize the lemmatized abstracts
vectorizer = CountVectorizer(analyzer='word',min_df=10,stop_words='english',lowercase=True,token_pattern='[a-zA-Z0-9]{3,}',max_features=50000)
phrases_vectorized = vectorizer.fit_transform(phrases_lemmatized)
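# Quick check (sketch): shape of the document-term matrix and size of the vocabulary kept
# after the min_df / max_features filtering.
print("Document-term matrix:", phrases_vectorized.shape, "- vocabulary size:", len(vectorizer.get_feature_names()))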
# Build the LDA model, fit it, and report its scores
lda_model = LatentDirichletAllocation(n_components=20,max_iter=10,learning_method='online',random_state=100,batch_size=128,evaluate_every = -1)
lda_output = lda_model.fit_transform(phrases_vectorized)
print(lda_model)
print("Log Likelihood: ", lda_model.score(phrases_vectorized))
print("Perplexity: ", lda_model.perplexity(phrases_vectorized))
pprint(lda_model.get_params())
LatentDirichletAllocation(learning_method='online', n_components=20,
random_state=100)
Log Likelihood: -4105598.808790911
Perplexity: 847.2215567636372
{'batch_size': 128,
'doc_topic_prior': None,
'evaluate_every': -1,
'learning_decay': 0.7,
'learning_method': 'online',
'learning_offset': 10.0,
'max_doc_update_iter': 100,
'max_iter': 10,
'mean_change_tol': 0.001,
'n_components': 20,
'n_jobs': None,
'perp_tol': 0.1,
'random_state': 100,
'topic_word_prior': None,
'total_samples': 1000000.0,
'verbose': 0}
# GridSearchCV parameter grid
search_params = {'n_components': [10, 15, 20, 25, 30], 'learning_decay': [.5, .7, .9]}
# LDA model and GridSearchCV
lda = LatentDirichletAllocation(max_iter=5, learning_method='online', learning_offset=50.,random_state=0)
model = GridSearchCV(lda, param_grid=search_params)
# Fit
model.fit(phrases_vectorized)
# Best estimator
best_lda_model = model.best_estimator_
# Model parameters
print("Parameters of the selected model:", model.best_params_)
# Log likelihood
print("Best log-likelihood score:", model.best_score_)
# Perplexity
print("Model perplexity:", best_lda_model.perplexity(phrases_vectorized))
Parameters of the selected model: {'learning_decay': 0.5, 'n_components': 10}
Best log-likelihood score: -855264.2413431403
Model perplexity: 841.4407986384678
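# Optional sketch (not part of the original notebook): compare the mean cross-validated
# log-likelihood across n_components for each learning_decay value tested by the grid search.
cv_results = pd.DataFrame(model.cv_results_)
plt.figure(figsize=(10,5))
for decay in search_params['learning_decay']:
    subset = cv_results[cv_results['param_learning_decay'] == decay].sort_values('param_n_components')
    plt.plot(subset['param_n_components'].astype(int), subset['mean_test_score'], label=f'learning_decay={decay}')
plt.xlabel('n_components')
plt.ylabel('Mean CV log-likelihood')
plt.legend()
plt.title('Grid search results')
plt.show()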
# Build the document-topic matrix
lda_output = best_lda_model.transform(phrases_vectorized)
topicnames = ['Topic' + str(i) for i in range(best_lda_model.n_components)]
# Build the dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames)
# Dominant topic of each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic
# Style
def color_green(val):
    color = 'green' if val > .1 else 'black'
    return 'color: {col}'.format(col=color)

def make_bold(val):
    weight = 700 if val > .1 else 400
    return 'font-weight: {weight}'.format(weight=weight)
# Apply the style
df_document_topics = df_document_topic.head(15).style.applymap(color_green).applymap(make_bold)
df_document_topics
# Dataframe of the word weights for each topic
df_topic_keywords = pd.DataFrame(best_lda_model.components_)
# Dataframe labels
df_topic_keywords.columns = vectorizer.get_feature_names()
df_topic_keywords.index = topicnames
# Show the top n keywords for each topic
def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20):
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords
topic_keywords = show_topics(vectorizer=vectorizer, lda_model=best_lda_model, n_words=10)
# Build a dataframe with each topic's keywords and name each topic manually
Topics = ["Sentiment Analysis","Behavior Analysis","Knowledge Transfer","Process Management","Species Studies","Technology Development","Health Studies","Machine Learning","Journal Citation","Network Analysis"]
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords["Topics"] = Topics
df_topic_keywords
# We can then attach its dominant topic to each abstract
map_dominant_topic = df_document_topic['dominant_topic']
df_clean['dominant_topic'] = map_dominant_topic
df_clean['dominant_topic'] = df_clean['dominant_topic'].replace(list(range(10)), Topics)
df_clean
df_clean['dominant_topic'].value_counts().nlargest(5).plot(kind='bar',title='Dominant topics',color=['C0', 'C1', 'C2', 'C3', 'C4'])
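# Exploratory sketch (assumption, not in the original analysis): cross-tabulate the dominant topics
# against the research term to see which topics dominate each search.
pd.crosstab(df_clean['dominant_topic'], df_clean['Research Term'])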
## Export the dataframes to build phylomemies
list_export = ['Publication Day', 'Publication Month', 'Publication Year',
               'Authors', 'Title', 'Source', 'Abstract']
dict_export = {'Article Title': 'Title',
               'Source Title': 'Source'}
bdsk_clean = bdsk_clean.rename(columns=dict_export)
mlss_clean = mlss_clean.rename(columns=dict_export)
ktss_clean = ktss_clean.rename(columns=dict_export)
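# Publication Day / Month are not among the kept columns; they are zero-filled here,
# presumably to match the column layout expected by the phylomemy tool.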
bdsk_clean[["Publication Day","Publication Month"]] = 0
mlss_clean[["Publication Day","Publication Month"]] = 0
ktss_clean[["Publication Day","Publication Month"]] = 0
bdsk_export = bdsk_clean[list_export]
mlss_export = mlss_clean[list_export]
ktss_export = ktss_clean[list_export]
bdsk_export.to_csv('/work/export_data/bdsk_export.csv',index=False,sep='\t')
mlss_export.to_csv('/work/export_data/mlss_export.csv',index=False,sep='\t')
ktss_export.to_csv('/work/export_data/ktss_export.csv',index=False,sep='\t')
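# Verification sketch (optional): re-read one of the exports to confirm the tab-separated layout.
pd.read_csv('/work/export_data/bdsk_export.csv', sep='\t').head()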