Pandas
Pandas data types
Pandas series
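A Series is a one-dimensional labeled array, the building block of a DataFrame; a minimal sketch:
import pandas as pd
#values plus an optional index of labels
s = pd.Series([809, 731, 588], index=["US", "AUS", "JPN"])
print(s)
print(s.dtype)  #the shared data type of the values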
Pandas dataframe
Create a pandas DataFrame
Create a Dataframe from a Dictionary
# Pre-defined lists
names = ['United States', 'Australia', 'Japan', 'India', 'Russia', 'Morocco', 'Egypt']
dr = [True, False, False, False, True, True, True]
cpc = [809, 731, 588, 18, 200, 70, 45]
# Import pandas as pd
import pandas as pd
# Create dictionary my_dict with three key:value pairs: my_dict
my_dict = {"country":names, "drives_right":dr, "cars_per_cap":cpc}
# Build a DataFrame cars from my_dict: cars
cars = pd.DataFrame(my_dict)
# Print cars
print(cars)
# Definition of row_labels
row_labels = ['US', 'AUS', 'JPN', 'IN', 'RU', 'MOR', 'EG']
# Specify row labels of cars
cars.index = row_labels
# Print cars again
print(cars)
Create a DataFrame from CSV.
import pandas as pd
#import the bestsellers-with-categories.csv
mydata = pd.read_csv("bestsellers-with-categories.csv")
#pd.read_csv() accepts a sep argument (e.g. sep=";") that sets the column separator
print(mydata)
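For instance, the financial.csv file used later in the Melt section is semicolon-separated:
financial_preview = pd.read_csv("financial.csv", sep=";")
print(financial_preview.head())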
Create a DataFrame from JSON.
myjson_data = pd.read_json("hpcharactersdataraw.json")
print(myjson_data)
Data accessing in Pandas
Column accessing
print(cars)
print( cars[["country"]] )
#what would be this data type?
print( type(cars[["country"]]) )
print( cars[["country", "cars_per_cap"]] )
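Note: single brackets return a Series, while double brackets return a DataFrame:
print( type(cars["country"]) )   #pandas Series
print( type(cars[["country"]]) ) #pandas DataFrame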
Row accessing
#say I want to access the 2nd through 4th rows (positions 1 to 3; the end of the slice is excluded)
print( cars[1:4] )
Row accessing by Row Label -> loc[]
print(cars)
print( cars.loc[["AUS"]] )
print( cars.loc[["AUS", "JPN"]] )
print( cars.loc[ ["AUS", "JPN"] , ["country", "cars_per_cap"] ] )
filtering with loc
#build a boolean mask and pass it to loc to keep only the matching rows
condition = cars["country"] == "Japan"
print( cars.loc[condition] )
Row accessing by Row index -> iloc[]
print(cars)
print( cars.iloc[[1]] )
print( cars.iloc[[1,2]] )
print( cars.iloc[ [1,2] , [0, 2] ] )
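iloc also accepts plain position slices (end excluded):
#rows at positions 1-2 and the first two columns
print( cars.iloc[1:3, 0:2] )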
Add or delete data with pandas
import pandas as pd
#let's import the data
my_data = pd.read_csv("bestsellers-with-categories.csv")
print(my_data.head(2))
Delete columns - df.drop()
#axis=0 targets rows and axis=1 targets columns; since I want to delete a column, I use axis=1
print("-----modified output:-----")
print ( my_data.drop(["Name"], axis=1).head(2) )
#notice that the original dataset still contains the column
print("-----original data:-----")
print(my_data.head(2))
my_data.drop("Name", axis = 1, inplace=True)
#notice that the column has been removed now from the dataset
print(my_data.head(2))
Delete rows - df.drop()
print("-----original dataframe:-----")
print(my_data.head(5))
#delete the first row
print("-----without first row:-----")
print (my_data.drop(0, axis = 0).head(5))
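Several rows can be dropped at once by passing a list of index labels:
#drop the rows labeled 0, 1 and 2 in the default RangeIndex
print( my_data.drop([0, 1, 2], axis=0).head(5) )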
Add columns
import pandas as pd
import numpy as np
#to add an empty column to our dataframe, we can fill it with np.nan as a placeholder
#original dataset
print(my_data.head(5))
#let's add a column called "Country"
#NaN means "Not a Number" and marks a missing value
my_data["Country"] = np.nan
#print the modified dataframe
print(my_data.head(5))
adding an index column to the data
#first count the rows in your DataFrame
print( my_data.shape[0] )
#now create a numpy array with that length
arrayayay = np.arange(0, my_data.shape[0])
#now create a new column and assign that array to it
#(yes, you can assign numpy arrays as column values)
my_data["indexx"] = arrayayay
print(my_data)
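To promote that column to the actual row index instead of keeping it as a regular column, one option is set_index:
#non-destructive: returns a new DataFrame indexed by "indexx"
print( my_data.set_index("indexx").head(5) )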
Add rows
#DataFrame.append() was deprecated and removed in pandas 2.0, so use pd.concat() instead
#this stacks the dataframe on top of itself, doubling the rows
print( pd.concat([my_data, my_data]) )
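To append a single row, one option is to wrap it in a one-row DataFrame and concatenate; here the first row is reused just as a placeholder value:
new_row = pd.DataFrame([my_data.iloc[0].to_dict()])
print( pd.concat([my_data, new_row], ignore_index=True).tail(2) )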
Deal with Null Data
import numpy as np
import pandas as pd
dicttt = {
"Col1":[1, 2, 3, np.nan],
"Col2":[4, np.nan, 6, 7],
"col3":["a", "b", "c", None]
}
df = pd.DataFrame(dicttt)
print(df)
identify null data in a DF
#isnull() will help you to identify null values
print("-----Boolean dataframe: ------")
print( df.isnull() )
#if you want to turn those values into numeric:
print("-----Numeric dataframe: ------")
print( df.isnull()*1 )
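Since True counts as 1, summing the boolean frame gives the null count per column:
print( df.isnull().sum() )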
replace null values
replace for a string
#i will replace null values for the word "missing"
print( df.fillna("missing") )
replace for column mean
#numeric_only=True skips the string column, which has no mean
print( df.fillna(df.mean(numeric_only=True)) )
replace null values with an interpolation
#interpolation estimates missing numeric values from their neighbors, so select the numeric columns first
print( df[["Col1", "Col2"]].interpolate() )
delete null values
print( df.dropna() )
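dropna also takes arguments to control what is dropped:
#how="all" removes only rows where every value is null
print( df.dropna(how="all") )
#subset limits the check to the given columns
print( df.dropna(subset=["Col1"]) )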
Filtering with conditions
#let's import the dataset
import pandas as pd
data = pd.read_csv("bestsellers-with-categories.csv")
#first 5 rows
print(data.head(5))
print( data[ data["User Rating"] >= 4.8 ] )
print( data[ data["Genre"] == "Fiction" ] )
Filtering with multiple conditions
Fiction = data["Genre"] == "Fiction"
Publishedafter2018 = data["Year"] >= 2018
print(data[ Fiction & Publishedafter2018])
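Conditions can be negated with ~ and combined with | (OR):
#non-fiction books OR books published in/after 2018
print( data[ ~Fiction | Publishedafter2018 ] )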
Column Information
know columns data type
# .info() will tell you the data type of each column (it prints directly and returns None, so no print() needed)
data.info()
change the column data type
#choose the column and use .astype() to select your desired data type
data["Name"] = data["Name"].astype("string")
data["Author"] = data["Author"].astype("string")
data["Genre"] = data["Genre"].astype("string")
#and i will check the data type again
data.info()
know column main statistics
# .describe() will tell you main descriptive variables
print( data.describe() )
see the last records of the dataframe
# .tail(n) will show you the last n records
print( data.tail(2) )
see memory usage of each column
# .memory_usage(deep=True) #the deep argument is important to get the true per-column usage, since it inspects object contents
print( data.memory_usage(deep=True) )
count records in a column
#count records from each author in a column
data["Author"].value_counts()
delete duplicates
#let's get all the unique values for the user ratings
print( data["User Rating"].drop_duplicates() )
delete duplicates and keep the last
#show the last book that got a unique rating
#the argument keep="last" keeps the last occurrence of each value instead of the first
print( data["User Rating"].drop_duplicates(keep="last") )
Column sorting (ascending)
#let's sort ascendingly by user rating
print( data.sort_values("User Rating", ascending=True) )
Column sorting (descending)
#let's sort descendingly by user rating
print( data.sort_values("User Rating", ascending=False) )
Group by
#this is my dataset
print ( data.head(5) )
print ( data.groupby("Author").count() )
print(data.groupby("Author").median())
print(data.groupby("Author").sum().loc["William Davis"])
print(data.groupby("Author").sum().reset_index())
.agg()
print( data.groupby("Author").agg(["min", "max"]) )
print( data.groupby("Author").agg( {"Reviews":["min","max"], "User Rating":"sum"} ) )
Group by(multiple arguments)
print( data.groupby(["Author", "Year"]).count() )
Merge & Concat
import pandas as pd
import numpy as np
#let's create a new dataframe
df1 = pd.DataFrame(
{
"A":["A0","A1","A2","A3"],
"B":["B0","B1","B2","B3"],
"C":["C0","C1","C2","C3"],
"D":["D0","D1","D2","D3"]
}
)
df2 = pd.DataFrame(
{
'A':['A4', 'A5', 'A6','A7'],
'B':['B4', 'B5', 'B6','B7'],
'C':['C4', 'C5', 'C6','C7'],
'D':['D4', 'D5', 'D6','D7']
}
)
print("-----Database1:-----")
print(df1)
print("-----Database2:-----")
print(df2)
concatenate dataframes (row level)
#row level concatenation
print( pd.concat([df1, df2]) )
print( pd.concat([df1, df2], ignore_index=True) )
concatenate dataframes (column level)
#column level concatenation
print( pd.concat([df1,df2], axis=1) )
Merge
inner join with .merge()
#let's merge the two dataframes on their shared key column
#creating dataframe
izq = pd.DataFrame(
{
'key' : ['k0', 'k1', 'k2','k3'],
'A' : ['A0', 'A1', 'A2','A3'],
'B': ['B0', 'B1', 'B2','B3']
}
)
der = pd.DataFrame(
{
'key' : ['k0', 'k1', 'k2','k3'],
'C' : ['C0', 'C1', 'C2','C3'],
'D': ['D0', 'D1', 'D2','D3']
}
)
print("-----Left database:-----")
print(izq)
print("-----Right database-----")
print(der)
#merge
merge1 = izq.merge(der , on="key")
print(merge1)
inner join with .merge() -different col names-
#MERGE 2
izq = pd.DataFrame({'key' : ['k0', 'k1', 'k2','k3'],
'A' : ['A0', 'A1', 'A2','A3'],
'B': ['B0', 'B1', 'B2','B3']})
der = pd.DataFrame({'key_2' : ['k0', 'k1', 'k2','k3'],
'C' : ['C0', 'C1', 'C2','C3'],
'D': ['D0', 'D1', 'D2','D3']})
print("-----Left database:-----")
print(izq)
print("-----Right database-----")
print(der)
merge2 = izq.merge(der, left_on="key", right_on="key_2")
print(merge2)
left join with .merge() -empty records in key col-
#let's create the dataframes
izq = pd.DataFrame({'key' : ['k0', 'k1', 'k2','k3'],
'A' : ['A0', 'A1', 'A2','A3'],
'B': ['B0', 'B1', 'B2','B3']})
der = pd.DataFrame({'key_2' : ['k0', 'k1', 'k2',np.nan],
'C' : ['C0', 'C1', 'C2','C3'],
'D': ['D0', 'D1', 'D2','D3']})
print("-----Left database:-----")
print(izq)
print("-----Right database-----")
print(der)
merge3 = izq.merge(der, left_on="key" , right_on="key_2", how = "left")
print(merge3)
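how also accepts "right" and "outer"; an outer join keeps the unmatched rows from both sides:
merge4 = izq.merge(der, left_on="key", right_on="key_2", how="outer")
print(merge4)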
Join
import pandas as pd
import numpy as np
izq = pd.DataFrame({'A': ['A0','A1','A2'],
'B':['B0','B1','B2']},
index=['k0','k1','k2'])
der =pd.DataFrame({'C': ['C0','C1','C2'],
'D':['D0','D1','D2']},
index=['k0','k2','k3'])
print("-----left database-----")
print(izq)
print("-----right database-----")
print(der)
inner join
print(izq.join(der, how="inner"))
outer join
print(izq.join(der, how="outer"))
left join
print(izq.join(der, how="left"))
right join
print(izq.join(der, how="right"))
In terms of performance on big data, which is better: merge or join? join operates on the index, so it is typically faster when the keys are already indexed; merge is the more flexible, general-purpose option.
Pivot Table
import pandas as pd
import numpy as np
#let's import our data
df_books = pd.read_csv("bestsellers-with-categories.csv")
#now let's explore it
print( df_books.head() )
print( df_books.pivot_table(index="Author", columns="Genre", values="User Rating") )
print( df_books.pivot_table(index="Genre", columns="Year", values="User Rating", aggfunc="mean") )
print( df_books.pivot_table(index="Genre", columns="Year", values="User Rating", aggfunc="median") )
Melt (or unpivot columns)
import numpy as np
import pandas as pd
financial = pd.read_csv("financial.csv", delimiter=";")
print(financial)
print( financial.melt(id_vars="Categoria") )
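melt can also rename the two generated columns; the names below are just illustrative:
#var_name and value_name label the columns that melt creates
print( financial.melt(id_vars="Categoria", var_name="concept", value_name="amount") )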
Apply
import pandas as pd
#creating my dataframe
data = pd.read_csv("bestsellers-with-categories.csv")
#and let's create a function that multiplies its argument by 2
def twotimes(value):
    return value*2
#and now we can apply that function to User Rating
data["User Rating2"] = data["User Rating"].apply(twotimes)
print(data)
data["Rating3"] = data["User Rating"].apply(lambda x: x*3)
print(data)
Apply functions with conditions
#double the User Rating only for the records that meet the condition (Genre == "Fiction")
data["RatingConditioned"] = data.apply(
lambda x : x["User Rating"]*2 if x["Genre"] == "Fiction" else x["User Rating"], axis = 1)
print(data)
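For a simple condition like this, a vectorized alternative with np.where is usually much faster than apply on big data (the new column name is just for illustration):
import numpy as np
data["RatingConditioned2"] = np.where(data["Genre"] == "Fiction",
                                      data["User Rating"] * 2,
                                      data["User Rating"])
print(data)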