# Sample data: release years 2011-2020 and one duration value per year
years = list(range(2011, 2021))
durations = [103, 101, 99, 100, 100, 95, 95, 96, 93, 90]
# Bundle both lists into a single dictionary keyed by column name
movie_dict = dict(years=years, duration=durations)
# Display the dictionary
movie_dict
# pandas supplies the DataFrame type used below
import pandas as pd
# Turn the dictionary into a two-column DataFrame
duration_df = pd.DataFrame(data=movie_dict)
# Display the DataFrame
duration_df
# matplotlib.pyplot provides the plotting calls used in this script
import matplotlib.pyplot as plt

# Start a new figure for the line plot
fig = plt.figure()
# Line plot with the 'years' column on x and the 'duration' column on y
x_years = duration_df['years']
y_durations = duration_df['duration']
plt.plot(x_years, y_durations)
# Title the chart
plt.title('Netflix Movie Duration 2011-2020')
# Render the figure
plt.show()
# Read in the CSV as a DataFrame
netflix_df = pd.read_csv('netflix_data.csv')
# Preview the first five rows of the DataFrame (the original evaluated the
# whole frame here, which contradicted its own "first five rows" comment)
netflix_df.head(5)
# Keep only the rows whose 'type' column equals "Movie"
netflix_df_movies_only = netflix_df[netflix_df['type'] == 'Movie']
# Select only the columns of interest
columns_of_interest = ['title', 'country', 'genre', 'release_year', 'duration']
netflix_movies_col_subset = netflix_df_movies_only[columns_of_interest]
# Preview the first five rows of the new DataFrame
netflix_movies_col_subset.head(5)
# Create a figure and increase the figure size
fig = plt.figure(figsize=(12, 8))
# Scatter plot of duration versus release year: release year goes on the
# x-axis and duration on the y-axis, matching the title below and the
# coloured scatter plot drawn later in this script (the arguments were
# previously swapped).
plt.scatter(
    netflix_movies_col_subset['release_year'],
    netflix_movies_col_subset['duration'],
)
# Create a title and axis labels
plt.title('Movie Duration by Year of Release')
plt.xlabel('Release year')
plt.ylabel('Duration (min)')
# Show the plot
plt.show()
# Scatter plots make it easier to spot outliers
# Movies whose duration exceeds 300 minutes
is_very_long = netflix_movies_col_subset['duration'] > 300
longer_movie = netflix_movies_col_subset[is_very_long]
# Display the long movies
longer_movie
# Movies whose duration is under 60 minutes
is_short = netflix_movies_col_subset['duration'] < 60
short_movies = netflix_movies_col_subset[is_short]
# Display the first 20 short movies
short_movies.head(20)
# Colour assigned to each highlighted genre; any genre value not listed
# here falls back to 'black'.
genre_colors = {
    'Children': 'red',
    'Documentaries': 'blue',
    'Stand-up': 'green',
}
# Build one colour per row, in row order, by looking up the 'genre' column
# directly instead of walking the frame with iterrows().
colors = [
    genre_colors.get(genre, 'black')
    for genre in netflix_movies_col_subset['genre']
]
# Inspect the first 10 values in the list
colors[:10]
# Apply the 'fivethirtyeight' style sheet and initialize a new figure
plt.style.use('fivethirtyeight')
plt.figure(figsize=(10, 8))
# Scatter plot with release year on x and duration on y, coloured per row
# by the `colors` list
release_years = netflix_movies_col_subset["release_year"]
movie_durations = netflix_movies_col_subset["duration"]
plt.scatter(release_years, movie_durations, c=colors)
# Title and axis labels
plt.title("Movie duration by year of release")
plt.xlabel("Release year")
plt.ylabel("Duration (min)")
# Render the figure
plt.show()