import pandas as pd
# A pandas Series maps an explicit index to its values; defining the index
# deliberately matters because it is how we look the data up later.
psg_players = pd.Series(['Navas', 'Mbappe', 'Neymar', 'Messi'], index=[1, 7, 10, 30])
psg_players
# Without an explicit index, pandas assigns a default RangeIndex (0, 1, 2, ...):
pd.Series(['Navas', 'Mbappe', 'Neymar', 'Messi'])
# A dictionary can also seed a Series: keys become the index labels.
dictionary = {1: 'Navas', 7: 'Mbappe', 10: 'Neymar', 30: "Messi"}
pd.Series(dictionary)
# Looking up a label that was never assigned (0 is not in 1, 7, 10, 30)
# raises a KeyError. NOTE(fix): the bare lookup would abort the whole
# script, so the demonstration is wrapped in try/except.
try:
    psg_players[0]
except KeyError as err:
    print(f'KeyError as expected: {err}')
# Lookup with a label that does exist returns the value.
psg_players[7]
# A dictionary of column-name -> values can seed a DataFrame as well.
dictionary = {
    'Players': ['Navas', 'Mbappe', 'Neymar', 'Messi'],
    'height': ['183.0', '170.0', '170.0', '165.0'],
    'goals': [2, 200, 200, 200],
}
# An explicit row index can be supplied here too; omitting it makes pandas
# fall back to the default 0-based RangeIndex.
pd.DataFrame(dictionary, index=[1, 7, 10, 30])
# (notebook export artifact) A chart cell was here but was empty —
# it was probably not set up properly in the original notebook.
# Load the bestsellers dataset (the CSV must sit next to this script).
# read_csv splits on commas by default; pass sep=';' for semicolon-separated
# files. header=0 tells pandas the first row supplies the column names
# (use header=None when the file has no header row).
df_book=pd.read_csv('bestsellers-with-categories.csv', sep=',', header=0)
df_book
df_book.columns
# Plain slicing on a DataFrame selects rows by position.
df_book[0:4]
# With loc we filter by labels: a row-label range plus a list of column names.
df_book.loc[0:4, ['Name', 'Author']]
# Column values can be transformed on the fly (this returns a new frame
# and does not modify df_book).
df_book.loc[:, ['Reviews']]*-1
# loc also supports element-wise comparisons, yielding a boolean frame.
df_book.loc[:,['Author']] == 'JJ Smith'
# With iloc we filter by integer positions instead of labels.
df_book.iloc[:, 0:3]
df_book.iloc[:2,2:]
# drop removes labels along an axis: axis=1 targets columns, axis=0 rows.
# By default drop returns a modified COPY and leaves the frame intact;
# inplace=True would mutate the frame itself.
# NOTE(fix): the original passed inplace=True here, which permanently
# removed 'Genre' and broke the Genre-based filtering and pivot tables
# further down this script. Without inplace, df_book keeps the column.
df_book.drop('Genre', axis=1)
# Python's del statement is another way to remove a column; unlike drop
# it always mutates the DataFrame in place, so no inplace flag is needed.
del df_book['Price']
# Dropping a single row by its index label (returns a copy).
df_book.drop(0, axis=0)
# Several rows at once, as a list of labels...
df_book.drop([0, 1, 2, 3], axis=0)
# ...or produced by Python's range().
df_book.drop(range(0, 7), axis=0)
# Assigning a scalar to a new column name broadcasts it to every row.
df_book['New_Column'] = 0
df_book
import numpy as np
# Build a small frame with missing values: np.nan in the numeric columns,
# None in the object column — pandas treats both as "missing".
dictionary = {
    'col1': [1, 2, 3, np.nan],
    'col2': [4, np.nan, 6, 7],
    'col3': ['a', 'b', 'c', None],
}
df = pd.DataFrame(dictionary)
df
# isnull flags every cell that holds NaN or None.
df.isnull()
# fillna substitutes a replacement value into every missing cell...
df.fillna('Missing')
# ...and that replacement can be computed, e.g. the mean of a column.
df.fillna(df['col1'].mean())
# dropna discards every row that contains at least one missing value.
df.dropna()
# Comparing a column against a value yields a boolean mask, one flag per row.
df_book['Year'] > 2016
# Store the mask in a variable and use it to index the DataFrame:
# only the rows where the mask is True survive.
published_after_2016 = df_book['Year'] > 2016
df_book[published_after_2016]
# Masks combine with & (element-wise AND) to stack several conditions.
is_fiction = df_book['Genre'] == 'Fiction'
df_book[is_fiction & published_after_2016]
# info() summarises the DataFrame: column names, index, non-null counts
# per column, each column's dtype, and the total rows/columns — a quick
# health check of the data.
df_book.info()
# describe() reports summary statistics for the numeric columns only.
df_book.describe()
# tail(n) returns the last n rows of the DataFrame...
df_book.tail(2)
# ...and head(n) the first n rows.
df_book.head(3)
# memory_usage shows per-column memory consumption; deep=True also counts
# the Python objects behind object-dtype columns. Helps budget memory for
# the models built on top of this data.
df_book.memory_usage(deep=True)
# value_counts() returns how many times each unique value occurs.
df_book['Author'].value_counts()
# Duplicate the first row to demonstrate de-duplication.
# NOTE(fix): DataFrame.append was deprecated in pandas 1.4 and removed in
# pandas 2.0; pd.concat is the supported replacement. iloc[[0]] (note the
# list) keeps the row as a one-row DataFrame instead of a Series.
df_book = pd.concat([df_book, df_book.iloc[[0]]])
# drop_duplicates returns the DataFrame with exact duplicate rows removed.
df_book.drop_duplicates()
# sort_values orders rows by a column; ascending=False sorts newest first.
df_book.sort_values('Year', ascending=False)
# groupby aggregations: count of non-null values per column, per author.
df_book.groupby('Author').count()
# NOTE(fix): since pandas 2.0, mean() raises a TypeError when non-numeric
# columns (Name, Genre) are present in the groups, so restrict it to the
# numeric columns explicitly.
df_book.groupby('Author').mean(numeric_only=True)
# reset_index turns the group labels back into a regular 0-based index.
df_book.groupby('Author').mean(numeric_only=True).reset_index()
# A single group's aggregate row can be picked out with loc.
# numeric_only=True keeps sum() from concatenating the string columns.
df_book.groupby('Author').sum(numeric_only=True).loc['William Davis']
# agg applies one or several aggregation functions at once.
# NOTE(fix): agg('min', 'max') passed 'max' as the *axis* argument; a list
# is the correct way to request several functions.
df_book.groupby('Author').agg(['min', 'max'])
# Per-column aggregation functions go in a dict.
df_book.groupby('Author').agg({'Reviews': ['min', 'max'], 'User Rating': ['sum']})
# Grouping by several columns builds a MultiIndex of group keys.
df_book.groupby(['Author', 'Year']).count()
# Build two 4x4 demo frames whose cells encode column letter + row number
# (A0..D3 in the first, A4..D7 in the second).
df1 = pd.DataFrame({col: [f'{col}{i}' for i in range(4)] for col in 'ABCD'})
df2 = pd.DataFrame({col: [f'{col}{i}' for i in range(4, 8)] for col in 'ABCD'})
# concat stacks along rows by default (axis=0); ignore_index=True discards
# the original row labels and renumbers the result from 0.
pd.concat([df1, df2], ignore_index=True)
# axis=1 concatenates side by side (column-wise) instead.
pd.concat([df1, df2], axis=1, ignore_index=True)
# Two frames sharing a 'key' column, to demonstrate SQL-style joins.
izq = pd.DataFrame({
    'key': [f'k{i}' for i in range(4)],
    'A': [f'A{i}' for i in range(4)],
    'B': [f'B{i}' for i in range(4)],
})
der = pd.DataFrame({
    'key': [f'k{i}' for i in range(4)],
    'C': [f'C{i}' for i in range(4)],
    'D': [f'D{i}' for i in range(4)],
})
# merge joins the frames on their common column; trying to merge without
# a shared column (and no explicit keys) raises an error.
izq.merge(der, on='key')
# The same join when the key columns are named differently on each side.
izq = pd.DataFrame({
    'key_1': [f'k{i}' for i in range(4)],
    'A': [f'A{i}' for i in range(4)],
    'B': [f'B{i}' for i in range(4)],
})
der = pd.DataFrame({
    'key_2': [f'k{i}' for i in range(4)],
    'C': [f'C{i}' for i in range(4)],
    'D': [f'D{i}' for i in range(4)],
})
# left_on/right_on spell out which column belongs to which frame.
izq.merge(der, left_on='key_1', right_on='key_2')
# how= controls the join type: 'inner' (default), 'left', 'right', ...
izq.merge(der, left_on='key_1', right_on='key_2', how='left')
# Two frames joined by their index labels; the indexes only partially
# overlap (k0 and k2 appear in both, k1 and k3 in just one).
izq = pd.DataFrame(
    {'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2']},
    index=['k0', 'k1', 'k2'],
)
der = pd.DataFrame(
    {'C': ['C0', 'C1', 'C2'], 'D': ['D0', 'D1', 'D2']},
    index=['k0', 'k2', 'k3'],
)
# join combines on the index; how= picks the join type ('inner' keeps only
# labels present on both sides; 'left', 'right' and 'outer' also exist).
izq.join(der, how='inner')
# pivot_table cross-tabulates: authors as rows, genres as columns, with
# 'User Rating' aggregated in each cell (the default aggregation is mean).
df_book.pivot_table(index='Author',columns='Genre',values='User Rating')
# aggfunc swaps in a different aggregation — here the sum per genre/year.
df_book.pivot_table(index='Genre',columns='Year', values='User Rating',aggfunc='sum')
def two_times(value):
    """Return *value* multiplied by two (works element-wise via apply)."""
    doubled = 2 * value
    return doubled
# apply maps a function over every element of a column.
df_book['User Rating'].apply(two_times)
# The transformed column can be stored under a new name.
df_book['Rating 2'] = df_book['User Rating'].apply(two_times)
df_book
# A lambda works just as well for one-off transformations.
df_book['User Rating2'] = df_book['User Rating'].apply(lambda rating: rating * 3)
df_book