# Dependencies (install once from a shell, or prefix with "!" in a notebook cell):
#   pip install pmaw newspaper3k
import datetime as dt
from pprint import pprint
from pmaw import PushshiftAPI
# Client for the Pushshift Reddit archive.
api = PushshiftAPI()

# Search terms; Pushshift treats "|" as OR, so any one of them matches.
q = "vaccine|vaccination|vaccinate|vaccinates|vaccinating|vaccinated|vax|vaxx|antivax|antivaxx|anti-vax|anti-vaxx"

# The search window is all of 2021. The datetimes are pinned to UTC because a
# naive datetime's .timestamp() is interpreted in the local timezone, which
# would shift the window depending on the machine running the script.
window_start = dt.datetime(2021, 1, 1, 0, 0, 0, tzinfo=dt.timezone.utc)
window_end = dt.datetime(2021, 12, 31, 23, 59, 59, tzinfo=dt.timezone.utc)

submissions = api.search_submissions(
    q=q,  # query
    after=int(window_start.timestamp()),
    before=int(window_end.timestamp()),
    subreddit="news",
    limit=10,
)
# We convert the special Response object to a list of dictionaries that is easier to work with
submissions = list(submissions)

# Preview the first result, but only if the search returned anything;
# indexing an empty list would raise IndexError.
if submissions:
    pprint(submissions[0])  # print first result

# Collect the linked URLs in a set so that duplicates are dropped.
urls = {submission["url"] for submission in submissions}
pprint(urls)
from newspaper import Article, ArticleException

# One dict per article that could be downloaded and parsed.
data = []
for url in urls:
    article = Article(url)
    try:
        article.download()
        article.parse()
    except ArticleException as exc:
        # A submission may link to a page that cannot be fetched or parsed;
        # skip that URL instead of aborting the whole run.
        print(f"Skipping {url}: {exc}")
        continue

    data.append(
        {
            "title": article.title,
            "authors": article.authors,
            "top_image": article.top_image,
            "text": article.text,
            "publish_date": article.publish_date,
            "url": url,
        }
    )

# `data` is a list of dicts, one per successfully parsed article.
pprint(data)
import pandas as pd
# Tabulate the scraped records: one row per article, one column per dict key.
df = pd.DataFrame(data=data)
# Bare expression: renders the table when this runs as a notebook cell.
df
# NOTE: notebook output captured at export time:
#   "This chart is empty. Chart was probably not set up properly in the notebook."
# Persist the table to disk. Note that the file is tab-separated despite the
# ".csv" extension, so readers must pass sep="\t". The DataFrame's row index is
# written as the first column because `index` is left at its default.
df.to_csv("data.csv", sep="\t")