# 5/10/22 Intro to Python for Data Analysis Workshop

import pandas as pd

# Load the `pokemon.csv` file into a Pandas dataframe that we can work with.
pokemon = pd.read_csv("pokemon.csv")
pokemon

# Build a new dataframe whose row labels (index) are the Pokemon names.
# `pokemon` itself is left unchanged because set_index returns a new object.
name_pokemon = pokemon.set_index(keys="name")

# A dataframe can also be built by hand from a dictionary that maps each
# column name to the list of values in that column.
students = pd.DataFrame(
    data={
        "ID": [101, 100, 105, 110, 124, 99],
        "Name": ["Colin", "Kai", "Kyra", "William", "Michelle", "Michael"],
        "Major": ["DS", "DS", "DS", "CSE", "CS", "Honors"],
    }
)

# Null values (explained in the slides) are written as `None` in the lists.
daily_weather = pd.DataFrame(
    data={
        "Day": ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday"],
        "Weather": ["Sunny", "Cloudy", "Rainy", None, None],
    }
)
# inplace=True modifies daily_weather directly instead of returning a copy.
daily_weather.set_index("Day", inplace=True)

# Peek at the first rows of the dataframe (head() shows 5 rows by default).
pokemon.head(n=5)

# First two rows, restricted to the "pokedex_number" and "name" columns.
pokemon[["pokedex_number", "name"]].head(n=2)

# --- Position-based selection with .iloc ---
# Value at row position 1, column position 2:
# pokemon.iloc[1,2]
# Every value in the row at position 1:
# pokemon.iloc[1]
# Rows at positions 0, 1 and 2 (the end of an iloc slice is exclusive):
pokemon.iloc[0:3]
# Columns at positions 0 and 1 for the rows at positions 0 to 2:
# pokemon.iloc[0:3,0:2]

# --- Label-based selection with .loc ---
# Value in the "type1" column for the row labelled "Charmander":
name_pokemon.loc["Charmander", "type1"]
# Every row from "Charmander" through "Squirtle" (label slices are inclusive):
name_pokemon.loc["Charmander":"Squirtle"]

# Summary of a single column; the commented variants summarise the dataframe.
pokemon["name"].describe()
# pokemon.describe()
# pokemon.describe(include = "all")

# Column names, non-null counts and dtypes.
pokemon.info()

# (number of rows, number of columns)
pokemon.shape

# The column labels; `.values` would give the underlying data instead.
pokemon.columns
# pokemon.values

# How many Pokemon have each primary type.
pokemon["type1"].value_counts()

# The five rows with the smallest "hp", showing only three columns.
pokemon[["name", "hp", "attack"]].nsmallest(n=5, columns=["hp"])

# Names and hp of the Pokemon whose hp is lower than 70.
pokemon.loc[pokemon["hp"] < 70, ["name", "hp"]]

# Pokemon whose name contains "Cha" anywhere in it.
pokemon.loc[pokemon["name"].str.contains("Cha")]

# Pokemon whose name starts with "Chan": unlike str.contains, str.match only
# matches the pattern at the beginning of each string.
pokemon.loc[pokemon["name"].str.match(r"Chan")]

# Check every single value in the dataframe: True if it is null, else False.
pokemon.isna()

# "Tell me which columns have null values and which ones don't"
pokemon.isna().any()
# Rows whose "type2" is null:
# pokemon[pokemon["type2"].isna()]
# Names of the columns in which at least one null value appears.
pokemon.columns[pokemon.isna().any()]

# Drop the "type2" column. drop() is not specific to null values; it removes
# any row or column you name. The result is a new dataframe, so `pokemon`
# itself is unchanged.
pokemon.drop(columns="type2")

# "If you find a null value, drop the row that contains the null value"
# axis=0 drops rows; axis=1 would instead drop every *column* that contains
# a null. The result is a new dataframe, so `pokemon` itself is unchanged.
pokemon.dropna(axis=0)

# "If you find a null value in the dataframe, replace it with 'NO INFO'"
# The filled dataframe is returned, not stored, so `pokemon` keeps its nulls.
pokemon.fillna(value="NO INFO")

pokemon.head(n=5)

# The five tallest Pokemon: sort by height, largest first, and keep the top 5.
pokemon.sort_values(by="height_m", ascending=False).head(n=5)

# We can also sort individual columns as Series! Let's just get the heights of the 5 tallest Pokemon.

# Sort one column (a Series) from largest to smallest and keep the first five.
pokemon["height_m"].sort_values(ascending=False).head(5)

# A single column is a Series...
pokemon["weight_kg"]

# ...and arithmetic on a Series is applied to every element (kg -> lb here).
pokemon["weight_kg"].mul(2.205)

# Two columns selected together form a smaller dataframe.
pokemon[["attack", "sp_attack"]]

# Adding two Series adds them row by row.
pokemon["attack"].add(pokemon["sp_attack"])

# Creates a Series containing the sum of all of a Pokemon's individual stat values.
# Challenge: Is there a faster way to get these 6 columns using what you
# learned about column indexing?
# Every term must be a column of `pokemon`; a bare ['hp'] would be a plain
# one-element list rather than the hp column.
total_stats = (
    pokemon['hp']
    + pokemon['attack']
    + pokemon['defense']
    + pokemon['sp_attack']
    + pokemon['sp_defense']
    + pokemon['speed']
)
pokemon['total_stat'] = total_stats  # Adds the Series as a new column in the Dataframe called 'total_stat'
pokemon[['name', 'total_stat']]  # Displays the Pokemon's name and sum of their base stats
# NOTE(review): the original cell appears to end with a bare `pokemon`
# (display of the whole dataframe); kept here, confirm against the notebook.
pokemon

def determine_bmi_category(row: pd.Series) -> str:
    """Classify one Pokemon's body-mass index (BMI) from its weight and height.

    When ``apply`` is called on a whole Dataframe with ``axis=1``, the data of
    one row is passed in as a Series.

    Args:
        row: A row exposing ``weight_kg`` and ``height_m``.

    Returns:
        "Underweight" (BMI < 18.5), "Normal Range" (18.5 <= BMI <= 25),
        "Overweight" (BMI > 25), or "No BMI data available" when the weight
        or height is missing or the height is zero.
    """
    weight, height = row.weight_kg, row.height_m

    # We do not have weight/height data for some Pokemon, and a zero height
    # would make the division below meaningless.
    if pd.isna(weight) or pd.isna(height) or height == 0:
        return "No BMI data available"

    # BMI is weight in kilograms divided by the *square* of height in metres;
    # the 18.5 / 25 thresholds below assume that definition.
    bmi = weight / height ** 2

    # Determine how the Pokemon's BMI should be classified
    if bmi < 18.5:
        return "Underweight"
    elif bmi <= 25:
        return "Normal Range"
    elif bmi > 25:
        return "Overweight"
    else:
        # Only reachable if bmi is not comparable (e.g. NaN).
        return "No BMI data available"


# Determine each Pokemon's BMI classification from their weight and height.
# The axis=1 argument specifies that we want this function applied to every
# row in the Dataframe.
BMI_categories = pokemon.apply(determine_bmi_category, axis=1)

# Make a new column called "bmi_category" storing these categories
pokemon['bmi_category'] = BMI_categories

# Visualize this new column
pokemon[['name', 'weight_kg', 'height_m', 'bmi_category']]

def convert_to_lb(weight_kg: float) -> float:
    """Convert a weight in kilograms to pounds using the factor 2.205."""
    pounds_per_kg = 2.205
    return weight_kg * pounds_per_kg


# Calling apply on a single column runs the function once per value.
pokemon['weight_kg'].apply(convert_to_lb)

# If you are familiar with Python lambdas, you may achieve this without the
# need to write a separate named function:
# pokemon['weight_kg'].apply(lambda weight_kg: weight_kg * 2.205)

# Load a second table holding the Japanese names and preview it.
japanese_names = pd.read_csv("pokemon_japanese_names.csv")
japanese_names.head(n=5)

# Combine the two tables, matching rows that share a "pokedex_number".
pd.merge(pokemon, japanese_names, on="pokedex_number")

# One column of values...
pokemon["hp"]

# ...reduced to a single number: the average hp.
pokemon["hp"].mean()

# How many distinct primary types exist.
pokemon["type1"].nunique()

# 1. Create groups where each group represents a unique generation
# 2. Keep track of Pokemon speed stats in these groups
# 3. Calculate the average speed stat of Pokemon belonging to that generation
# 4. Sort the averages in ascending order
pokemon.groupby("generation")["speed"].mean().sort_values(ascending=True)

# Summary statistics of "total_stat", computed separately for each value of
# "is_legendary". Requires the "total_stat" column to have been added first.
pokemon.groupby("is_legendary")["total_stat"].describe()

# Count how many Pokemon have each type as either their primary (type1) or
# secondary (type2) type. The two counts are aligned by type name; with
# fill_value=0 a type that appears in only one of the two columns keeps its
# count instead of becoming NaN, as it would with a plain `+`.
pokemon.groupby('type1').pokedex_number.count().add(
    pokemon.groupby('type2').pokedex_number.count(),
    fill_value=0,
)
# What other function could you call instead of groupby to answer this same question?

# 1. A Pokemon is either legendary or not. Create one group for each category.
# 2. There are seven Pokemon generations in this dataset. Create one sub-group
#    for each generation.
# 3. Count the number of Pokemon in each (is_legendary, generation) grouping.
# sort_index() ensures that generation values are presented in order.
pokemon.groupby(["is_legendary", "generation"])["pokedex_number"].count().sort_index()

def print_column_values(passin_list, chunk_size=25):
    """Print the values of an iterable in lists of at most ``chunk_size`` items.

    With the default of 25, a list of 50 values is printed as the first 25
    values on one line and the remaining 25 values on a second line. A final,
    shorter group is printed as well; nothing is printed for empty input.

    This helper is only used to display long lists compactly. It is not
    important to learn how it works for this workshop.

    Args:
        passin_list: The values to print (any iterable).
        chunk_size: Maximum number of values per printed line (default 25).

    Returns:
        Nothing because it only prints values :D

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    chunk = []
    for value in passin_list:
        chunk.append(value)
        if len(chunk) == chunk_size:
            print(chunk)
            chunk = []

    # Print whatever is left over when the total is not a multiple of chunk_size.
    if chunk:
        print(chunk)

# The code below essentially answers this question:
#     What rows in my dataframe have null values?
#
# `pokemon.columns[pokemon.isna().any()]` gives the names of the columns that
# contain null values, and we can iterate over that because it is a container
# of values.
#
# null_locations is a dictionary whose keys are the column names that contain
# null values. The value of each key is a list of indexes which indicate
# "where exactly in my column does a null value appear".
#
# By the end of this, null_locations should look something like this:
#     {"type2":[1,2,3,4,5], "hp":[1,5,6,7,9]}
# This tells us that rows 1,2,3,4,5 have null values in the "type2" column and
# rows 1,5,6,7,9 have null values in the "hp" column.
#
# For each column where a null value appears:
#   1. select the index labels of the rows where that column is null
#      (`pokemon.index[pokemon[column_name].isna()].tolist()`)
#   2. store them in the dictionary under the column's name
null_locations = {
    column_name: pokemon.index[pokemon[column_name].isna()].tolist()
    for column_name in pokemon.columns[pokemon.isna().any()]
}

# Print out what we have in the dictionary :D
for column_name, null_rows in null_locations.items():
    print(f"{column_name} column:")
    print_column_values(null_rows)

# Filter with a query string: Pokemon with hp below 70 whose name contains "Cha".
pokemon.query("hp < 70 & name.str.contains('Cha')")
# The equivalent way to get the same results is:
# pokemon[(pokemon["hp"] < 70) & (pokemon["name"].str.contains('Cha'))]
# DO NOT FORGET THE PARENTHESES!

# Scatter plot with hp on the x-axis and attack on the y-axis.
pokemon.plot(kind="scatter", x="hp", y="attack")