# Import pandas to create a data frame
import pandas as pd
import numpy as np
# Import the libraries we'll use to create graphs
from matplotlib import pyplot as plt
# Ask IPython/Jupyter to render graphs inline (in this window, not a new one).
# The bare "%matplotlib inline" magic is only valid inside IPython; calling it
# through get_ipython() keeps this file runnable as a plain Python script too,
# where get_ipython is undefined and the step is simply skipped.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass
# From "Exploring Data"
# Import the csv file as a dataframe and call the dataframe "df".
df = pd.read_csv('books.csv', engine='python')
# Print the column names so we remember what they are when we're calling them
# to make graphs. A bare `df.columns` only echoes when it is the last line of
# a notebook cell; print() shows the names wherever this code runs.
print(df.columns)
# Scatter plot with matplotlib: page count on the horizontal axis,
# average rating on the vertical axis.
fig, ax = plt.subplots()
ax.scatter(x=df['num_pages'], y=df['average_rating'])
# Apply the title and both axis labels in a single call.
ax.set(
    title='Number of Pages vs. Average Rating for Goodreads Books',
    xlabel='Number of Pages',  # x (horizontal) axis
    ylabel='Average Rating',  # y (vertical) axis
)
# Question 1: scatter plot of ratings count (x) against text reviews count (y)
fig, ax = plt.subplots()
ax.scatter(x=df['ratings_count'], y=df['text_reviews_count'])
# Apply the title and both axis labels in a single call.
ax.set(
    title='Ratings Count vs. Text Reviews Count',
    xlabel='Ratings Count',  # x (horizontal) axis
    ylabel='Text Review Count',  # y (vertical) axis
)
# Plot a histogram of the number of pages on its own figure. plt.hist() draws
# on whichever axes are current, so without a fresh figure the bars would land
# on top of the previous scatter plot when these steps run back to back.
fig, ax = plt.subplots()
ax.hist(df['num_pages'], bins=20)  # you can change the number of bins to see more or less detail
# Label the axes and add a title, as for the scatter plots
ax.set_title('Distribution of Number of Pages')
ax.set_xlabel('Number of Pages')
ax.set_ylabel('Frequency')
# Also print the descriptive statistics for the num_pages variable
print(df['num_pages'].describe())
# Question 2 Code Here
# Plot a histogram of the average rating on its own figure. plt.hist() draws
# on whichever axes are current, so a fresh figure keeps these bars from being
# drawn over an earlier plot when the steps run back to back.
fig, ax = plt.subplots()
ax.hist(df['average_rating'], bins=5)
# Label the axes and add a title, as for the scatter plots
ax.set_title('Distribution of Average Rating')
ax.set_xlabel('Average Rating')
ax.set_ylabel('Frequency')
# Also print the descriptive statistics for the average_rating variable
print(df['average_rating'].describe())
# Question 3: I found that using 5 bins was most helpful. Because all ratings fall between 0 and 5,
# 5 bins give each bin a width of roughly one rating point, which showed that just over half of the
# ratings were between 3 and 4, and the rest were between 4 and 5, with a few small outliers.