Run this line first to import the Pandas library!
import pandas as pd
There are multiple ways to create a pandas DataFrame. Here are two of the most common: from a dictionary and from a CSV file.
A dictionary is a collection of key-value pairs (a mapping, not a type of list): each element has a key and a value, and every key must be unique.
# Build a DataFrame from a dictionary: every key becomes a column label and
# the list stored under that key supplies the column's values.
student_data = {
    "ID": [101, 100, 105, 110, 124, 99],
    "Name": ["Andrew", "Sai", "Andy", "Zeeshan", "Juan", "Jake"],
    "Major": ["DS", "BME", "DS", "CSE", "CS", "DS"],
}
students = pd.DataFrame(data=student_data)
# Show the resulting table
students
CSV stands for comma-separated values: a plain-text format in which each line is a row and the columns are separated by commas.
# Create a pandas DataFrame from the 'movie_dataset.csv' file
# (a relative path, so the file must be in the current working directory)
movies = pd.read_csv('movie_dataset.csv')
# Show the loaded table
movies
# Print a summary of the DataFrame: column names, non-null counts, and data types
movies.info()
# Return the number of rows and columns as a (rows, columns) tuple
movies.shape
# List the column labels as an Index object
movies.columns
# Return summary statistics (count, mean, std, min, quartiles, max);
# by default only the numeric columns are included
movies.describe()
.head() & .tail()
# Show the first rows of the DataFrame (.head() returns 5 rows by default;
# .tail() does the same from the end of the DataFrame)
movies.head()
# Select only the "genres" and "title" columns, then show their first 2 rows
movies[["genres", "title"]].head(2)
.iloc[] & .loc[]
# .iloc selects by integer position; the end of a slice is excluded.
# Grab the value at row position 0 and column position 7
# (NOTE(review): originally described as the "name" column - confirm against movies.columns)
movies.iloc[0,7]
# Grab all the values at row position 1
movies.iloc[1]
# Grab all the values from row position 0 to row position 2 (position 3 is excluded)
movies.iloc[0:3]
# Display the values in columns 0 and 1 for rows 0 to 2 (slice ends are excluded)
movies.iloc[0:3,0:2]
# Choose specific rows by position by passing a list to iloc
movies.iloc[[0, 10, 20, 30]]
# .loc selects by label; unlike .iloc, both ends of a slice are included.
# Grab the movie title of the row whose index label is 0
# (the first row, given the default index created by read_csv)
movies.loc[0, "title"]
# Grab all the rows with index labels from 0 to 5 (inclusive)
movies.loc[0: 5]
We can filter rows in our data frame based on boolean expressions.
# Boolean filtering: build a True/False mask with one entry per row, then use
# it to keep only the rows where the mask is True.

# Titles and revenue of the movies with revenue greater than $30,000,000.
# A single .loc[mask, columns] call replaces the chained [[...]][mask] lookup.
movies.loc[movies["revenue"] > 30000000, ["title", "revenue"]]
# Movies whose title contains "Super".
# regex=False treats the pattern as a literal substring; na=False makes a
# missing title count as "no match" instead of leaving NaN in the mask, which
# boolean indexing rejects with a ValueError.
movies[movies["title"].str.contains("Super", regex=False, na=False)]
# Movies whose title starts with "Super" (literal prefix test; missing titles
# are treated as "no match")
movies[movies["title"].str.startswith("Super", na=False)]
# Titles and runtime of the movies with runtime more than 120 minutes
movies.loc[movies["runtime"] > 120, ["title", "runtime"]]
# Element-wise null check: True where a cell is missing, False otherwise
movies.isna()
# One boolean per column: True if that column holds at least one missing value
movies.isna().any(axis=0)
# Labels of the columns in which missing values appear
movies.columns[movies.isna().any(axis=0)]
# Rows whose "genres" value is missing
movies.loc[movies["genres"].isna()]
# Drop the "tagline" column (drop is not specific to null values; it removes any row/column by label)
movies.drop(columns="tagline")
# Drop every row that contains at least one missing value
movies.dropna(how="any")
# Replace every missing value in the DataFrame with 'NO INFO'
movies.fillna(value="NO INFO")
# Average runtime across all movies (missing values are skipped by default)
movies["runtime"].mean()
# Number of distinct values in the "genres" column (missing values are not counted by default)
movies["genres"].nunique()
# 1. Create groups where each group represents a unique genre
# 2. Keep track of movie runtimes in these groups
# 3. Find the minimum runtime and maximum runtime of movies belonging to that genre
# The aggregations are named as strings: passing the Python builtins min/max
# is deprecated in recent pandas and emits a FutureWarning.
movies.groupby("genres")["runtime"].agg(["min", "max"])
# 1. Create groups where each group represents a unique genre
# 2. Keep track of movie runtime in these groups
# 3. Calculate the average runtime of movies belonging to each genre
# 4. Sort averages ascending
movies.groupby("genres")["runtime"].mean().sort_values()
# 1. Create groups where each group represents a unique genre
# 2. Keep track of movie revenue in these groups
# 3. Calculate the sum of the revenue of movies belonging to each genre
# 4. Sort by ascending
movies.groupby("genres")["revenue"].sum().sort_values()