U1L3 Pandas

# Import pandas to create a data frame import pandas as pd import numpy as np # Import the libraries we'll use to create graphs from matplotlib import pyplot as plt # This command makes sure the graphs show in this window, not a new one %matplotlib inline

# From "Exploring Data"
# Read the CSV file into a pandas DataFrame and bind it to the name "df",
# which the later cells use for plotting and summary statistics.
# The Python parsing engine is requested explicitly.
df = pd.read_csv(filepath_or_buffer='books.csv', engine='python')

# Let's print the column names so we remember what they are when we're
# calling them to make graphs.
# print() is used explicitly: a bare `df.columns` expression is only echoed
# when it happens to be the last expression of an interactive cell, and does
# nothing at all when the code is run as a script.
print(df.columns)

# Scatter plot (matplotlib): number of pages on the x axis against the
# average rating on the y axis.
fig, ax = plt.subplots()
ax.scatter(x=df['num_pages'], y=df['average_rating'])

# Describe the figure: a title plus one label per axis.
ax.set_title(label='Number of Pages vs. Average Rating for Goodreads Books')
ax.set_xlabel(xlabel='Number of Pages')  # x (horizontal) axis
ax.set_ylabel(ylabel='Average Rating')  # y (vertical) axis

# Question 1 Code Below
# Scatter plot (matplotlib): ratings count on the x axis against the
# text reviews count on the y axis.
fig, ax = plt.subplots()
ax.scatter(x=df['ratings_count'], y=df['text_reviews_count'])

# Describe the figure: a title plus one label per axis.
ax.set_title(label='Ratings Count vs. Text Reviews Count')
ax.set_xlabel(xlabel='Ratings Count')  # x (horizontal) axis
ax.set_ylabel(ylabel='Text Review Count')  # y (vertical) axis

# Histogram of the number of pages. Raise or lower `bins` to see more or
# less detail in the distribution.
page_counts = df['num_pages']
plt.hist(page_counts, bins=20)

# Descriptive statistics for the same num_pages column.
page_summary = page_counts.describe()
print(page_summary)

# Question 2 Code Here
# Plot a histogram of the average rating.
# range=(0, 5) pins the five bins to the whole-point intervals 0-1, 1-2, 2-3,
# 3-4 and 4-5. Without it, the bin edges are derived from the smallest and
# largest values that happen to be in the column, so they only line up with
# whole rating points when the data spans exactly 0 to 5.
# NOTE(review): assumes ratings are on a 0-5 scale; any value outside that
# range would be left out of the plot -- confirm against the data.
plt.hist(df['average_rating'], bins=5, range=(0, 5))

# Also print the descriptive statistics for the average rating.
print(df['average_rating'].describe())

Question 3: I found that using 5 bins was most helpful because all ratings were between 0 and 5. With 5 bins, the histogram showed that just over half of the ratings were between 3 and 4, and the rest were between 4 and 5, apart from a few small outliers.