import pandas as pd
# A pandas Series maps an explicit index to its values; defining the index
# deliberately matters because it is how we look the data up later.
psg_players = pd.Series(['Navas', 'Mbappe', 'Neymar', 'Messi'], index=[1, 7, 10, 30])
psg_players
# Without an explicit index, pandas assigns a default RangeIndex (0, 1, 2, ...):
pd.Series(['Navas', 'Mbappe', 'Neymar', 'Messi'])
# A dictionary can also seed a Series: keys become the index labels.
dictionary = {1: 'Navas', 7: 'Mbappe', 10: 'Neymar', 30: "Messi"}
pd.Series(dictionary)
# Looking up a label that was never assigned (0 is not in 1, 7, 10, 30)
# raises a KeyError. NOTE(fix): the bare lookup would abort the whole
# script, so the demonstration is wrapped in try/except.
try:
    psg_players[0]
except KeyError as err:
    print(f'KeyError as expected: {err}')
# Lookup with a label that does exist returns the value.
psg_players[7]
# A dictionary of column-name -> values can seed a DataFrame as well.
dictionary = {
    'Players': ['Navas', 'Mbappe', 'Neymar', 'Messi'],
    'height': ['183.0', '170.0', '170.0', '165.0'],
    'goals': [2, 200, 200, 200],
}
# An explicit row index can be supplied here too; omitting it makes pandas
# fall back to the default 0-based RangeIndex.
pd.DataFrame(dictionary, index=[1, 7, 10, 30])
# (notebook export artifact) A chart cell was here but was empty —
# it was probably not set up properly in the original notebook.
# Load the bestsellers dataset (the CSV must sit next to this script).
# read_csv splits on commas by default; pass sep=';' for semicolon-separated
# files. header=0 tells pandas the first row supplies the column names
# (use header=None when the file has no header row).
df_book=pd.read_csv('bestsellers-with-categories.csv', sep=',', header=0)
df_book
df_book.columns
# Plain slicing on a DataFrame selects rows by position.
df_book[0:4]
# With loc we filter by labels: a row-label range plus a list of column names.
df_book.loc[0:4, ['Name', 'Author']]
# Column values can be transformed on the fly (this returns a new frame
# and does not modify df_book).
df_book.loc[:, ['Reviews']]*-1
# loc also supports element-wise comparisons, yielding a boolean frame.
df_book.loc[:,['Author']] == 'JJ Smith'
# With iloc we filter by integer positions instead of labels.
df_book.iloc[:, 0:3]
df_book.iloc[:2,2:]
# drop removes labels along an axis: axis=1 targets columns, axis=0 rows.
# By default drop returns a modified COPY and leaves the frame intact;
# inplace=True would mutate the frame itself.
# NOTE(fix): the original passed inplace=True here, which permanently
# removed 'Genre' and broke the Genre-based filtering and pivot tables
# further down this script. Without inplace, df_book keeps the column.
df_book.drop('Genre', axis=1)
# Python's del statement is another way to remove a column; unlike drop
# it always mutates the DataFrame in place, so no inplace flag is needed.
del df_book['Price']
# Dropping a single row by its index label (returns a copy).
df_book.drop(0, axis=0)
# Several rows at once, as a list of labels...
df_book.drop([0, 1, 2, 3], axis=0)
# ...or produced by Python's range().
df_book.drop(range(0, 7), axis=0)
# Assigning a scalar to a new column name broadcasts it to every row.
df_book['New_Column'] = 0
df_book
import numpy as np
# Build a small frame with missing values: np.nan in the numeric columns,
# None in the object column — pandas treats both as "missing".
dictionary = {
    'col1': [1, 2, 3, np.nan],
    'col2': [4, np.nan, 6, 7],
    'col3': ['a', 'b', 'c', None],
}
df = pd.DataFrame(dictionary)
df
# isnull flags every cell that holds NaN or None.
df.isnull()
# fillna substitutes a replacement value into every missing cell...
df.fillna('Missing')
# ...and that replacement can be computed, e.g. the mean of a column.
df.fillna(df['col1'].mean())
# dropna discards every row that contains at least one missing value.
df.dropna()
# Comparing a column against a value yields a boolean mask, one flag per row.
df_book['Year'] > 2016
# Store the mask in a variable and use it to index the DataFrame:
# only the rows where the mask is True survive.
published_after_2016 = df_book['Year'] > 2016
df_book[published_after_2016]
# Masks combine with & (element-wise AND) to stack several conditions.
is_fiction = df_book['Genre'] == 'Fiction'
df_book[is_fiction & published_after_2016]
# info() summarises the DataFrame: column names, index, non-null counts
# per column, each column's dtype, and the total rows/columns — a quick
# health check of the data.
df_book.info()
# describe() reports summary statistics for the numeric columns only.
df_book.describe()
# tail(n) returns the last n rows of the DataFrame...
df_book.tail(2)
# ...and head(n) the first n rows.
df_book.head(3)
# memory_usage shows per-column memory consumption; deep=True also counts
# the Python objects behind object-dtype columns. Helps budget memory for
# the models built on top of this data.
df_book.memory_usage(deep=True)
# value_counts() returns how many times each unique value occurs.
df_book['Author'].value_counts()
# Duplicate the first row to demonstrate de-duplication.
# NOTE(fix): DataFrame.append was deprecated in pandas 1.4 and removed in
# pandas 2.0; pd.concat is the supported replacement. iloc[[0]] (note the
# list) keeps the row as a one-row DataFrame instead of a Series.
df_book = pd.concat([df_book, df_book.iloc[[0]]])
# drop_duplicates returns the DataFrame with exact duplicate rows removed.
df_book.drop_duplicates()
# sort_values orders rows by a column; ascending=False sorts newest first.
df_book.sort_values('Year', ascending=False)
# groupby aggregations: count of non-null values per column, per author.
df_book.groupby('Author').count()
# NOTE(fix): since pandas 2.0, mean() raises a TypeError when non-numeric
# columns (Name, Genre) are present in the groups, so restrict it to the
# numeric columns explicitly.
df_book.groupby('Author').mean(numeric_only=True)
# reset_index turns the group labels back into a regular 0-based index.
df_book.groupby('Author').mean(numeric_only=True).reset_index()
# A single group's aggregate row can be picked out with loc.
# numeric_only=True keeps sum() from concatenating the string columns.
df_book.groupby('Author').sum(numeric_only=True).loc['William Davis']
# agg applies one or several aggregation functions at once.
# NOTE(fix): agg('min', 'max') passed 'max' as the *axis* argument; a list
# is the correct way to request several functions.
df_book.groupby('Author').agg(['min', 'max'])
# Per-column aggregation functions go in a dict.
df_book.groupby('Author').agg({'Reviews': ['min', 'max'], 'User Rating': ['sum']})
# Grouping by several columns builds a MultiIndex of group keys.
df_book.groupby(['Author', 'Year']).count()
# Build two 4x4 demo frames whose cells encode column letter + row number
# (A0..D3 in the first, A4..D7 in the second).
df1 = pd.DataFrame({col: [f'{col}{i}' for i in range(4)] for col in 'ABCD'})
df2 = pd.DataFrame({col: [f'{col}{i}' for i in range(4, 8)] for col in 'ABCD'})
# concat stacks along rows by default (axis=0); ignore_index=True discards
# the original row labels and renumbers the result from 0.
pd.concat([df1, df2], ignore_index=True)
# axis=1 concatenates side by side (column-wise) instead.
pd.concat([df1, df2], axis=1, ignore_index=True)
# Two frames sharing a 'key' column, to demonstrate SQL-style joins.
izq = pd.DataFrame({
    'key': [f'k{i}' for i in range(4)],
    'A': [f'A{i}' for i in range(4)],
    'B': [f'B{i}' for i in range(4)],
})
der = pd.DataFrame({
    'key': [f'k{i}' for i in range(4)],
    'C': [f'C{i}' for i in range(4)],
    'D': [f'D{i}' for i in range(4)],
})
# merge joins the frames on their common column; trying to merge without
# a shared column (and no explicit keys) raises an error.
izq.merge(der, on='key')
# The same join when the key columns are named differently on each side.
izq = pd.DataFrame({
    'key_1': [f'k{i}' for i in range(4)],
    'A': [f'A{i}' for i in range(4)],
    'B': [f'B{i}' for i in range(4)],
})
der = pd.DataFrame({
    'key_2': [f'k{i}' for i in range(4)],
    'C': [f'C{i}' for i in range(4)],
    'D': [f'D{i}' for i in range(4)],
})
# left_on/right_on spell out which column belongs to which frame.
izq.merge(der, left_on='key_1', right_on='key_2')
# how= controls the join type: 'inner' (default), 'left', 'right', ...
izq.merge(der, left_on='key_1', right_on='key_2', how='left')
# Two frames joined by their index labels; the indexes only partially
# overlap (k0 and k2 appear in both, k1 and k3 in just one).
izq = pd.DataFrame(
    {'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2']},
    index=['k0', 'k1', 'k2'],
)
der = pd.DataFrame(
    {'C': ['C0', 'C1', 'C2'], 'D': ['D0', 'D1', 'D2']},
    index=['k0', 'k2', 'k3'],
)
# join combines on the index; how= picks the join type ('inner' keeps only
# labels present on both sides; 'left', 'right' and 'outer' also exist).
izq.join(der, how='inner')
# pivot_table cross-tabulates: authors as rows, genres as columns, with
# 'User Rating' aggregated in each cell (the default aggregation is mean).
df_book.pivot_table(index='Author',columns='Genre',values='User Rating')
# aggfunc swaps in a different aggregation — here the sum per genre/year.
df_book.pivot_table(index='Genre',columns='Year', values='User Rating',aggfunc='sum')
def two_times(value):
    """Return *value* multiplied by two (works element-wise via apply)."""
    doubled = 2 * value
    return doubled
# apply maps a function over every element of a column.
df_book['User Rating'].apply(two_times)
# The transformed column can be stored under a new name.
df_book['Rating 2'] = df_book['User Rating'].apply(two_times)
df_book
# A lambda works just as well for one-off transformations.
df_book['User Rating2'] = df_book['User Rating'].apply(lambda rating: rating * 3)
df_book