%%capture
!pip install praw
!pip install psaw
import csv
from datetime import datetime
import os
import pandas as pd
import praw
from psaw import PushshiftAPI
# Reddit API client built from the app credentials below. The id/secret values
# look like placeholders -- substitute real ones before running.
reddit = praw.Reddit(
    user_agent="<ShowerThoughts-DataDive>",
    client_secret="U-Your-Client-secret-w",
    client_id="CE-Your-Client-id-w",
)
# Collection window as epoch seconds: Jan 1 2020 up to Jan 1 2021.
# The datetimes are naive, so timestamp() interprets them in the local timezone.
start_epoch, end_epoch = (
    int(datetime(year, 1, 1).timestamp()) for year in (2020, 2021)
)

# Pushshift search wrapped around the PRAW client; `gen` yields the
# r/Showerthoughts submissions found inside the window.
api = PushshiftAPI(reddit)
gen = api.search_submissions(
    subreddit='Showerthoughts',
    after=start_epoch,
    before=end_epoch,
)
# Folder that receives the per-month CSV files, and the name (month-year) of
# the first CSV the scraping loop opens.
data_folder = "./data_files/"
file_name = "12-2020"

# Remember whether the folder was already there so the message below is only
# printed when this run actually creates it.
CHECK_FOLDER = os.path.isdir(data_folder)
# exist_ok=True avoids a FileExistsError if the folder appears between the
# check above and this call.
os.makedirs(data_folder, exist_ok=True)
if not CHECK_FOLDER:
    print("created folder : ", data_folder)
created folder : ./data_files/
# Stream every submission from `gen` into one CSV file per calendar month,
# named "<month>-<year>.csv" inside data_folder. Files are opened in append
# mode, so re-running adds rows to whatever is already there.
#
# `f` always holds the currently open file so it can be closed when the month
# changes and when the loop ends (the handle was previously never kept, so the
# final f.close() raised NameError and every file was leaked).
# newline="" is what the csv module expects of files given to csv.writer;
# without it embedded newlines are mishandled and an extra "\r" is written on
# platforms that use "\r\n" line endings.
f = open(f"{data_folder}{file_name}.csv", 'a', encoding="UTF-8", newline="")
cvsWriter = csv.writer(f)
count = 0
try:
    for submission in gen:
        count += 1
        # fromtimestamp() converts the epoch value to local time, so the
        # month/day/year/hour columns are in this machine's timezone.
        dt_object = datetime.fromtimestamp(submission.created_utc)
        current_name = f"{dt_object.month}-{dt_object.year}"
        if file_name != current_name:
            file_name = current_name
            print(f"Switching data files {file_name}")
            # Flush and release the previous month's file before moving on.
            f.close()
            f = open(f"{data_folder}{file_name}.csv", 'a', encoding="UTF-8", newline="")
            cvsWriter = csv.writer(f)
        # Column order: id, title, author, url, view_count, score,
        # month, day, year, hour.
        cvsWriter.writerow([
            submission.id,
            submission.title,
            submission.author,
            submission.url,
            submission.view_count,
            submission.score,
            dt_object.month,
            dt_object.day,
            dt_object.year,
            dt_object.hour,
        ])
finally:
    # Close the last file even if the API iteration fails part-way through.
    f.close()
# Load every ".csv" file found under data_folder into data_holder, one dict per
# row with the keys id, title, score, month and hour (columns 0, 1, 5, 6, 9).
data_holder = []
for root, dirs, files in os.walk(data_folder):
    for file in files:
        if not file.endswith(".csv"):
            continue
        # Join with `root` rather than data_folder: os.walk also descends into
        # subfolders, where data_folder + file would point at the wrong path.
        # Read as UTF-8 with newline="" to match the csv module's requirements
        # and the encoding the files were written with.
        with open(os.path.join(root, file), encoding="UTF-8", newline="") as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            line_count = 0
            for row in csv_reader:
                try:
                    data_holder.append({
                        "id": row[0],
                        "title": row[1],
                        "score": int(row[5]),
                        "month": int(row[6]),
                        "hour": int(row[9]),
                    })
                except (IndexError, ValueError):
                    # Row is too short or has a non-numeric field: show it and
                    # carry on with the rest of the file.
                    print(row)
                # Counts every row read, including the ones skipped above.
                line_count += 1
            print(f'Processed {line_count} lines from {file}')
Processed 88757 lines from 6-2020.csv
Processed 71227 lines from 9-2020.csv
Processed 29733 lines from 1-2020.csv
Processed 57939 lines from 12-2020.csv
Processed 81881 lines from 8-2020.csv
Processed 87284 lines from 7-2020.csv
Processed 141507 lines from 3-2020.csv
Processed 65712 lines from 10-2020.csv
Processed 107825 lines from 4-2020.csv
Processed 58819 lines from 11-2020.csv
Processed 100791 lines from 2-2020.csv
Processed 101123 lines from 5-2020.csv
# Report how many rows were loaded across all CSV files.
print(f"Total Items: {len(data_holder)}")
# Bare expression: presumably the last line of a notebook cell, where it shows
# the first parsed record. Raises IndexError if nothing was loaded.
data_holder[0]
Total Items: 992598
def thought_stats(data):
    """Print summary statistics for a collection of thoughts.

    Prints the average, maximum and minimum score, the average title length,
    and how many thoughts scored exactly 0, exactly 1, or more than 1 (each
    with its percentage of the total). If any score is negative, one extra
    line reports that count so the percentages still add up.

    Args:
        data: Sized collection of dicts, each with a numeric ``'score'`` and a
            ``'title'`` whose ``len()`` is its length.

    Returns:
        None. All results are written to standard output.
    """
    total = len(data)
    if total == 0:
        # Nothing to average; avoid dividing by zero below.
        print("No thoughts to summarize.")
        return

    scores = [thought['score'] for thought in data]
    total_score = sum(scores)
    total_length = sum(len(thought['title']) for thought in data)

    # Take the extremes from the data itself instead of seeding them with 1,
    # which gave wrong answers when every score was above (or below) 1.
    max_score = max(scores)
    min_score = min(scores)

    scored_0 = sum(1 for score in scores if score == 0)
    scored_1 = sum(1 for score in scores if score == 1)
    # Strictly greater than 1: negative scores are not upvotes.
    scored_over_1 = sum(1 for score in scores if score > 1)
    scored_below_0 = sum(1 for score in scores if score < 0)

    def percent(part):
        # Share of all thoughts, as a percentage rounded to two decimals.
        return round(part / total * 100, 2)

    print(f"Average Score: {round(total_score / total,2)}\nMax Score: {max_score}\nMin Score: {min_score}")
    print(f"Average Length: {round(total_length / total,2)}\n")
    print(f"Scored 0: {scored_0} thoughts {percent(scored_0)}%")
    print(f"Scored 1: {scored_1} thoughts {percent(scored_1)}%")
    print(f"Got an upvote! {scored_over_1} thoughts {percent(scored_over_1)}%")
    if scored_below_0:
        print(f"Scored below 0: {scored_below_0} thoughts {percent(scored_below_0)}%")
# Print the summary statistics for every row loaded into data_holder.
thought_stats(data_holder)
Average Score: 80.13
Max Score: 152376
Min Score: 0
Average Length: 86.48
Scored 0: 41428 thoughts 4.17%
Scored 1: 676774 thoughts 68.18%
Got an upvote! 274396 thoughts 27.64%
# Move the parsed rows into a DataFrame and add the title length as a column.
df = pd.DataFrame(data_holder)
df.head()
df['title_length'] = df['title'].str.len()
df.head()

# Mean of the numeric columns per posting hour. numeric_only=True skips the
# string columns (id, title), which make mean() raise TypeError on pandas 2.x.
# No pre-sort is needed: groupby orders the group keys by default.
df_hour = df.groupby('hour').mean(numeric_only=True)
df_hour
df_hour.plot(y='score')

# Same aggregation per calendar month.
df_month = df.groupby('month').mean(numeric_only=True)
df_month
df_month.plot(y='score')