# Import the data-analysis library
import pandas as pd
# Load the papers dataset from CSV into a DataFrame
papers = pd.read_csv('datasets/papers.csv')
# Preview the raw table
print(papers.head())
# Discard the identifier, event-type and PDF-name columns
papers = papers.drop(columns=['id', 'event_type', 'pdf_name'])
# Preview the table again, now without the dropped columns
print(papers.head())
# Bucket the papers by publication year
groups = papers.groupby('year')
# Count how many papers fall into each year
counts = groups.size()
# Visualise the counts as a bar plot
import matplotlib.pyplot as plt
%matplotlib inline
# Draw the number of papers per year as a bar chart (one bar per year).
# Series.plot() defaults to a line plot, so the kind is requested explicitly.
counts.plot(kind='bar')
plt.title("Machine Learning Publications since 1987")
# The trailing semicolon is kept from the notebook: it suppresses the echo of
# the call's return value when this is the last statement of a cell.
plt.xlabel("Year");
# Load the regular expression library
import re
# Print the titles of the first rows
print(papers['title'].head())
# Remove basic punctuation (comma, period, exclamation and question marks) and
# convert each title to lowercase in a single pass.
# The pattern is a raw string, and the dot needs no backslash because it is
# already literal inside a character class; this avoids the invalid-escape
# warning that '\.' triggers in a normal string literal.
papers['title_processed'] = papers['title'].map(
    lambda title: re.sub(r'[,.!?]', '', title).lower()
)
# Print the processed titles of the first rows only
print(papers['title_processed'].head())
# Import the wordcloud library
import wordcloud
# Join the different processed titles together into one space-separated string.
long_string = " ".join(papers.title_processed)
# Create a WordCloud object with the library's default settings.
# NOTE(review): this rebinds the name `wordcloud` from the imported module to
# the WordCloud instance, so the module is no longer reachable under that name
# after this line. A distinct variable name would avoid the shadowing, but any
# later code that uses `wordcloud` as the instance would need updating too.
wordcloud = wordcloud.WordCloud()
# Generate a word cloud from the combined titles
wordcloud.generate(long_string)
# Visualize the word cloud. The returned image is neither assigned nor printed;
# presumably this relies on a notebook echoing a cell's last expression —
# TODO confirm. Run as a plain script, this line has no visible effect.
wordcloud.to_image()
# Load the library with the CountVectorizer method
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
# Helper function
def plot_10_most_common_words(count_data, count_vectorizer):
    """Plot a bar chart of the ten most frequent words in a count matrix.

    Args:
        count_data: Document-term count matrix with one column per vocabulary
            word, as returned by ``count_vectorizer.fit_transform``. It must
            support ``.sum(axis=0)``.
        count_vectorizer: The fitted vectorizer that produced ``count_data``;
            used only to recover the vocabulary, in column order.

    Returns:
        None. The chart is displayed with ``plt.show()``.
    """
    import matplotlib.pyplot as plt

    # scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
    # get_feature_names(); prefer the new API but keep working on old versions.
    if hasattr(count_vectorizer, 'get_feature_names_out'):
        words = count_vectorizer.get_feature_names_out()
    else:
        words = count_vectorizer.get_feature_names()

    # Sum every column in one operation instead of densifying the matrix row
    # by row. asarray/ravel flattens the result whether the sum comes back as
    # a 1 x n matrix or as a 1-D array.
    total_counts = np.asarray(count_data.sum(axis=0)).ravel()

    # Keep the ten highest totals; sorted() is stable, so ties stay in
    # vocabulary order.
    top_pairs = sorted(
        zip(words, total_counts), key=lambda pair: pair[1], reverse=True
    )[:10]
    top_words = [word for word, _ in top_pairs]
    top_counts = [count for _, count in top_pairs]
    x_pos = np.arange(len(top_words))

    plt.bar(x_pos, top_counts, align='center')
    plt.xticks(x_pos, top_words, rotation=90)
    plt.xlabel('words')
    plt.ylabel('counts')
    plt.title('10 most common words')
    plt.show()
# Set up a bag-of-words vectorizer that drops common English stop words
count_vectorizer = CountVectorizer(stop_words='english')
# Learn the vocabulary from the cleaned titles and build the word-count matrix
count_data = count_vectorizer.fit_transform(papers.title_processed)
# Chart the ten most frequent words across all titles
plot_10_most_common_words(
    count_data=count_data,
    count_vectorizer=count_vectorizer,
)
import warnings
warnings.simplefilter("ignore", DeprecationWarning)
# Load the LDA model from sk-learn
from sklearn.decomposition import LatentDirichletAllocation as LDA
# Helper function
def print_topics(model, count_vectorizer, n_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    Args:
        model: Fitted topic model exposing ``components_``, iterated as one
            array of per-word weights per topic.
        count_vectorizer: Fitted vectorizer whose vocabulary order matches the
            columns of ``model.components_``.
        n_top_words: Number of words to print per topic.

    Returns:
        None. For each topic a blank line, a ``Topic #<index>:`` header and a
        space-separated line of words (highest weight first) are printed.
    """
    # scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
    # get_feature_names(); prefer the new API but keep working on old versions.
    if hasattr(count_vectorizer, 'get_feature_names_out'):
        words = count_vectorizer.get_feature_names_out()
    else:
        words = count_vectorizer.get_feature_names()

    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        top_indices = topic.argsort()[::-1][:n_top_words]
        print("\nTopic #%d:" % topic_idx)
        print(" ".join(words[i] for i in top_indices))
# Model settings: tweak these two values (use int values below 15)
number_topics = 5
number_words = 5
# Build the LDA model and fit it to the word counts in a single expression
# (fit() returns the fitted estimator itself)
lda = LDA(n_components=number_topics).fit(count_data)
# Report the top words of each topic the model found
print("Topics found via LDA:")
print_topics(lda, count_vectorizer, n_top_words=number_words)
# The historical data indicates that:
more_papers_published_in_2018 = True