# Investigating Netflix Movies

# Create the years and durations lists.
# The two lists are paired by position: durations[i] belongs to years[i].
years = [2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]
durations = [103, 101, 99, 100, 100, 95, 95, 96, 93, 90]

# Create a dictionary with the two lists (keys become column names later on)
movie_dict = {'years': years, 'duration': durations}

# Print the dictionary (explicit print so it also shows outside a notebook)
print(movie_dict)

# Import pandas under its usual alias
import pandas as pd

# Create a DataFrame from the dictionary: one column per key of movie_dict
duration_df = pd.DataFrame(movie_dict)

# Print the DataFrame (explicit print so it also shows outside a notebook)
print(duration_df)

# Import matplotlib.pyplot under its usual alias and create a figure
import matplotlib.pyplot as plt

fig = plt.figure()

# Draw a line plot of the 'duration' column against the 'years' column
plt.plot(duration_df['years'], duration_df['duration'])

# Create a title
plt.title('Netflix Movie Duration 2011-2020')

# Show the plot
plt.show()

# Read in the CSV as a DataFrame (path is resolved against the current
# working directory)
netflix_df = pd.read_csv('netflix_data.csv')

# Print the first five rows of the DataFrame; head() defaults to five rows
print(netflix_df.head())

# Subset the DataFrame for type "Movie"
netflix_df_movies_only = netflix_df[netflix_df['type'] == 'Movie']

# Select only the columns of interest
netflix_movies_col_subset = netflix_df_movies_only[
    ['title', 'country', 'genre', 'release_year', 'duration']
]

# Print the first five rows of the new DataFrame
print(netflix_movies_col_subset.head(5))

# Create a figure and increase the figure size
fig = plt.figure(figsize=(12, 8))

# Create a scatter plot of duration versus year: release year on the x-axis,
# duration on the y-axis, matching the title and the later styled plot.
plt.scatter(
    netflix_movies_col_subset['release_year'],
    netflix_movies_col_subset['duration'],
)

# Create a title and axis labels so the orientation is explicit
plt.title('Movie Duration by Year of Release')
plt.xlabel('Release year')
plt.ylabel('Duration (min)')

# Show the plot
plt.show()

# Scatter plots make it easier to spot outliers.

# Filter for durations longer than 300 (strictly greater; a value of exactly
# 300 is excluded)
longer_movie = netflix_movies_col_subset[
    netflix_movies_col_subset['duration'] > 300
]

# Print longer_movie
print(longer_movie)

# Filter for durations shorter than 60 (strictly less; a value of exactly 60
# is excluded)
short_movies = netflix_movies_col_subset[
    netflix_movies_col_subset['duration'] < 60
]

# Print the first 20 rows of short_movies
print(short_movies.head(20))

# Build one marker colour per row of netflix_movies_col_subset, in row order.
#
# Genres are compared case-insensitively: the original literal 'Stand-up'
# only matched that exact spelling.
# NOTE(review): the dataset presumably spells it 'Stand-Up' - confirm.
# Any genre not listed below (including a missing value, which astype(str)
# turns into a string that is not in the mapping) falls back to 'black'.
colors = (
    netflix_movies_col_subset['genre']
    .astype(str)
    .str.casefold()
    .map({
        'children': 'red',
        'documentaries': 'blue',
        'stand-up': 'green',
    })
    .fillna('black')
    .tolist()
)

# Inspect the first 10 values in the list
print(colors[:10])

# Set the figure style and initialize a new figure
plt.style.use('fivethirtyeight')
plt.figure(figsize=(10, 8))

# Create a scatter plot of duration versus release_year, colouring each point
# with the matching entry of the colors list
plt.scatter(
    netflix_movies_col_subset["release_year"],
    netflix_movies_col_subset["duration"],
    c=colors,
)

# Create a title and axis labels
plt.title("Movie duration by year of release")
plt.xlabel("Release year")
plt.ylabel("Duration (min)")

# Show the plot
plt.show()