### utility modules
from csv import reader
import datetime as dt
### utility functions
# opens a CSV file and returns its header row and its data rows
def open_dataset(file_path):
    """Read a CSV file and split it into a header row and data rows.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A ``(dataset_header, dataset)`` tuple. ``dataset_header`` is the
        first row of the file and ``dataset`` is a list of all remaining
        rows. Every row is a list of strings, as produced by ``csv.reader``.

    Raises:
        IndexError: If the file contains no rows at all.
    """
    # The with-block closes the file even if parsing fails; newline="" lets
    # the csv module do its own line-ending handling, so quoted fields that
    # contain line breaks are read back intact.
    with open(file_path, newline="") as opened_file:
        dataset = list(reader(opened_file))
    dataset_header = dataset[0]
    return dataset_header, dataset[1:]
# prints the header, a slice of rows, and the size of a dataset
def explore_data(dataset, header, start, end):
    """Print the header, the rows ``dataset[start:end]``, and the dimensions.

    Args:
        dataset: List of rows (each row a sequence of values).
        header: The header row; printed first, followed by a blank line.
        start: Index of the first row to print.
        end: Index one past the last row to print.

    Returns:
        None. Everything is written to standard output.
    """
    print(header, "\n")
    for row in dataset[start:end]:
        print(row)
    print("\nNumber of rows", len(dataset))
    # An empty dataset has no first row to measure, so report the header
    # width instead of failing with an IndexError.
    num_columns = len(dataset[0]) if dataset else len(header)
    print("Number of columns", num_columns)
# returns the average number of comments for a posts category
def find_average_num_comments(posts):
    """Return the mean comment count of ``posts``, rounded to two decimals.

    Args:
        posts: List of post rows; the comment count is read from column
            index 4 of each row and converted with ``int``.

    Returns:
        The average number of comments per post rounded to two decimal
        places, or ``0.0`` when ``posts`` is empty.

    Raises:
        ValueError: If a comment-count field cannot be parsed as an integer.
    """
    # An empty category has no meaningful average; returning 0.0 avoids a
    # ZeroDivisionError in the division below.
    if not posts:
        return 0.0
    total_comments = sum(int(post[4]) for post in posts)
    return round(total_comments / len(posts), 2)
# tallies posts and comments by the hour of day in which each post was created
def find_post_and_comment_counts_by_hour(posts):
    """Count posts and sum comments for each creation hour.

    Column index 6 of every row is parsed as a ``"%m/%d/%Y %H:%M"``
    timestamp and column index 4 is converted to an integer comment count.

    Args:
        posts: List of post rows.

    Returns:
        A ``(counts_by_hour, comments_by_hour)`` tuple of dicts keyed by the
        two-digit hour string (``"00"``-``"23"``): the number of posts
        created in that hour and the total comments those posts received.
    """
    # Convert every comment count up front, before any timestamp is parsed.
    timestamp_comment_pairs = [[post[6], int(post[4])] for post in posts]

    posts_per_hour = {}
    comments_per_hour = {}
    for created_at, num_comments in timestamp_comment_pairs:
        created = dt.datetime.strptime(created_at, "%m/%d/%Y %H:%M")
        hour = format(created, "%H")
        posts_per_hour[hour] = posts_per_hour.get(hour, 0) + 1
        comments_per_hour[hour] = comments_per_hour.get(hour, 0) + num_comments
    return posts_per_hour, comments_per_hour
# displays the top entries of a frequency table, sorted from highest to lowest
def display_freq_table(freq_table, label="", limit=5):
    """Print the highest-valued entries of a frequency table.

    Entries are ordered by value from highest to lowest; entries with equal
    values are ordered by key, also descending. Each entry is printed as
    ``key : value label``.

    Args:
        freq_table: Dict mapping keys to sortable values.
        label: Unit text printed after each value.
        limit: Maximum number of entries to print. Defaults to 5, the
            cut-off this function always used; pass ``None`` to print the
            whole table.

    Returns:
        None. Everything is written to standard output.
    """
    # Sorting (value, key) pairs ranks by value first and breaks ties by key.
    ranked = sorted(
        ((value, key) for key, value in freq_table.items()),
        reverse=True,
    )
    for value, key in ranked[:limit]:
        print(key, ":", value, label)
# load the HN posts CSV, keeping the header row apart from the data rows
hn_header, hn = open_dataset(file_path="HN_posts_year_to_Sep_26_2016.csv")
# show the header, the first five rows, and the dataset's dimensions
explore_data(dataset=hn, header=hn_header, start=0, end=5)
['id', 'title', 'url', 'num_points', 'num_comments', 'author', 'created_at']
['12579008', 'You have two days to comment if you want stem cells to be classified as your own', 'http://www.regulations.gov/document?D=FDA-2015-D-3719-0018', '1', '0', 'altstar', '9/26/2016 3:26']
['12579005', 'SQLAR the SQLite Archiver', 'https://www.sqlite.org/sqlar/doc/trunk/README.md', '1', '0', 'blacksqr', '9/26/2016 3:24']
['12578997', 'What if we just printed a flatscreen television on the side of our boxes?', 'https://medium.com/vanmoof/our-secrets-out-f21c1f03fdc8#.ietxmez43', '1', '0', 'pavel_lishin', '9/26/2016 3:19']
['12578989', 'algorithmic music', 'http://cacm.acm.org/magazines/2011/7/109891-algorithmic-composition/fulltext', '1', '0', 'poindontcare', '9/26/2016 3:16']
['12578979', 'How the Data Vault Enables the Next-Gen Data Warehouse and Data Lake', 'https://www.talend.com/blog/2016/05/12/talend-and-Â\x93the-data-vaultÂ\x94', '1', '0', 'markgainor1', '9/26/2016 3:14']
Number of rows 293119
Number of columns 7
ask_hn_posts, show_hn_posts, other_hn_posts = [], [], []
# separate the posts into ask_hn, show_hn and other_hn by title prefix;
# titles are lower-cased first so the match is case-insensitive
for post in hn:
    lowered_title = post[1].lower()
    if lowered_title.startswith("ask hn"):
        category_posts = ask_hn_posts
    elif lowered_title.startswith("show hn"):
        category_posts = show_hn_posts
    else:
        category_posts = other_hn_posts
    category_posts.append(post)
# print the size of each category and its share of all posts
template = "{}: {} | {:.2f}%"
labelled_categories = (
    ("Number of Ask HN Posts", ask_hn_posts),
    ("Number of Show HN Posts", show_hn_posts),
    ("Number of Other HN Posts", other_hn_posts),
)
for category_label, category_posts in labelled_categories:
    print(template.format(category_label, len(category_posts), len(category_posts) / len(hn) * 100))
Number of Ask HN Posts: 9139 | 3.12%
Number of Show HN Posts: 10158 | 3.47%
Number of Other HN Posts: 273822 | 93.42%
# average number of comments per post, computed separately for each category
avg_ask_comments = find_average_num_comments(posts=ask_hn_posts)
avg_show_comments = find_average_num_comments(posts=show_hn_posts)
avg_other_comments = find_average_num_comments(posts=other_hn_posts)
# report the three averages, one line per category
template = "{:}: {} comments per post"
category_averages = (
    ("Avg Ask HN", avg_ask_comments),
    ("Avg Show HN", avg_show_comments),
    ("Avg Other HN", avg_other_comments),
)
for category_label, category_average in category_averages:
    print(template.format(category_label, category_average))
Avg Ask HN: 10.39 comments per post
Avg Show HN: 4.89 comments per post
Avg Other HN: 6.46 comments per post
# finds the post counts and comment totals by hour of creation for the ask_hn_posts category
ask_counts_by_hour, ask_comments_by_hour = find_post_and_comment_counts_by_hour(ask_hn_posts)
print("\nNumber of comments by hour post created\n---")
# the table holds comment totals, so the unit label is "comments", not "posts"
display_freq_table(ask_comments_by_hour, "comments")
Number of comments by hour post created
---
15 : 18525 posts
13 : 7245 posts
17 : 5547 posts
14 : 4972 posts
18 : 4877 posts
# finds the post counts and comment totals by hour of creation for the show_hn_posts category
show_counts_by_hour, show_comments_by_hour = find_post_and_comment_counts_by_hour(show_hn_posts)
print("\nNumber of comments by hour post created\n---")
# the table holds comment totals, so the unit label is "comments", not "posts"
display_freq_table(show_comments_by_hour, "comments")
Number of comments by hour post created
---
14 : 3839 posts
15 : 3824 posts
16 : 3769 posts
12 : 3609 posts
13 : 3314 posts
# finds the post counts and comment totals by hour of creation for the other_hn_posts category
other_counts_by_hour, other_comments_by_hour = find_post_and_comment_counts_by_hour(other_hn_posts)
print("\nNumber of comments by hour post created\n---")
# the table holds comment totals, so the unit label is "comments", not "posts"
display_freq_table(other_comments_by_hour, "comments")
Number of comments by hour post created
---
17 : 118217 posts
16 : 116322 posts
15 : 115286 posts
18 : 112502 posts
14 : 108277 posts
# average comments per ask_hn post for each creation hour, rounded to two decimals
avg_ask_comments_by_hour = {
    hour: round(ask_comments_by_hour[hour] / post_count, 2)
    for hour, post_count in ask_counts_by_hour.items()
}
print("\nAvg number of comments by hour post created\n---")
display_freq_table(avg_ask_comments_by_hour, label="avg comments")
Avg number of comments by hour post created
---
15 : 28.68 avg comments
13 : 16.32 avg comments
12 : 12.38 avg comments
02 : 11.14 avg comments
10 : 10.68 avg comments
# average comments per show_hn post for each creation hour, rounded to two decimals
avg_show_comments_by_hour = {
    hour: round(show_comments_by_hour[hour] / post_count, 2)
    for hour, post_count in show_counts_by_hour.items()
}
print("\nAvg number of comments by hour post created\n---")
display_freq_table(avg_show_comments_by_hour, label="avg comments")
Avg number of comments by hour post created
---
12 : 6.99 avg comments
07 : 6.68 avg comments
11 : 6.0 avg comments
08 : 5.6 avg comments
14 : 5.52 avg comments
# average comments per other_hn post for each creation hour, rounded to two decimals
avg_other_comments_by_hour = {
    hour: round(other_comments_by_hour[hour] / post_count, 2)
    for hour, post_count in other_counts_by_hour.items()
}
print("\nAvg number of comments by hour post created\n---")
display_freq_table(avg_other_comments_by_hour, label="avg comments")
Avg number of comments by hour post created
---
12 : 7.59 avg comments
11 : 7.37 avg comments
02 : 7.18 avg comments
13 : 7.15 avg comments
05 : 6.79 avg comments