# Import pandas to create a data frame
import pandas as pd
import numpy as np
# Import the libraries we'll use to create graphs
from matplotlib import pyplot as plt
# This command makes sure the graphs show in this window, not a new one
%matplotlib inline
# From "Exploring Data"
# Load the books CSV file into a dataframe named "df"
df = pd.read_csv(
    filepath_or_buffer='books.csv',
    engine='python',
)
# Show the column names as a reminder of what to reference when making graphs
df.columns
# Draw a scatter plot with matplotlib on a fresh figure
fig, ax = plt.subplots()
# scatter takes the horizontal (x) values first, then the vertical (y) values
ax.scatter(x=df['num_pages'], y=df['average_rating'])
# Add the axis labels and the title
ax.set_xlabel(xlabel='Number of Pages')  # x (horizontal) axis
ax.set_title(label='Number of Pages vs. Average Rating for Goodreads Books')
ax.set_ylabel(ylabel='Average Rating')  # y (vertical) axis
# Question 1 Code Below
# Scatter plot of the ratings_count column against the text_reviews_count column
fig, ax = plt.subplots()
ax.scatter(df['ratings_count'], df['text_reviews_count'])  # note this takes the form of (x,y)
# label axes and add a title
ax.set_title('Ratings Count vs. Text Reviews Count')  # set the title
ax.set_xlabel('Ratings Count')  # set a label for the x (horizontal) axis
# Use the same wording as the title ("Reviews", plural) for the y (vertical) axis label
ax.set_ylabel('Text Reviews Count')
# Plot a histogram of the number of pages on its own figure, so it is not
# drawn over whichever axes happen to be current
fig, ax = plt.subplots()
ax.hist(df['num_pages'], bins=20)  # you can change the number of bins to see more or less detail
# label axes and add a title, as the scatter plots do
ax.set_title('Distribution of Number of Pages')
ax.set_xlabel('Number of Pages')
ax.set_ylabel('Number of Books')
# Also print the descriptive statistics for the num_pages column
print(df['num_pages'].describe())
count 1928.000000
mean 347.573133
std 263.456455
min 0.000000
25% 195.000000
50% 290.500000
75% 432.000000
max 3342.000000
Name: num_pages, dtype: float64
# Question 2 Code Here
# Plot a histogram of the average rating on its own figure, so it is not
# drawn over whichever axes happen to be current
fig, ax = plt.subplots()
ax.hist(df['average_rating'], bins=5)
# label axes and add a title, as the scatter plots do
ax.set_title('Distribution of Average Rating')
ax.set_xlabel('Average Rating')
ax.set_ylabel('Number of Books')
# Also print the descriptive statistics for the average_rating column
print(df['average_rating'].describe())
count 1928.000000
mean 3.942879
std 0.357928
min 0.000000
25% 3.780000
50% 3.970000
75% 4.140000
max 5.000000
Name: average_rating, dtype: float64
Question 3: I found that using 5 bins was most helpful. All ratings were between 0 and 5, so with 5 bins each bin covers one rating point. The histogram showed that just over half of the ratings were between 3 and 4, and the rest were between 4 and 5, with a few small outliers.