"""Introduction to data.

A first look at a dataset of very wealthy people (richest.csv) using
pandas, with a few charts drawn by pandas, plotly and matplotlib.

This file is a notebook export. Cells are separated by blank lines, and a
bare expression at the end of a cell (for example ``df``) is there so the
notebook shows its value below the cell.
"""

# matplotlib is a more traditional plotting library
from matplotlib import pyplot as plt
# imports the pandas library
import pandas as pd
# plotly is a plotting library we will look at later
import plotly.express as px

# uses the pandas library to read a csv, then saves the result,
# a dataframe, to a local variable
df = pd.read_csv('/work/richest.csv')
# causes the dataframe to be shown below the cell
df

# Square brackets are used to show a particular column.
# Sum is a built in function used to add together all
# values in a column. However, we see a problem - as the
# wealth values are a text field, they get turned into one
# big string rather than being added together.
df['Wealth'].sum()

# The dtypes property shows the datatypes of all the
# columns in a dataframe. Here, we have the following
# int64 - An integer (whole number) stored in 64 bits of space.
# float64 - A floating point decimal (the decimal point can
#           move) stored in 64 bits of space.
# object - strings in pandas are listed as an object
#          this is because they have some extra capabilities beyond
#          a normal python string
df.dtypes

# This turns the value from Wealth into a string
# (see note above about how strings are objects)
# and then uses the built in strip function to remove
# certain characters
df['Wealth'] = df['Wealth'].str.strip('$')
df['Wealth'] = df['Wealth'].str.strip('B')
df

# We can then convert the type to be float, a decimal.
# In each case, we're creating a new column to overwrite
# the old column, with some operations performed on it.
df['Wealth'] = df['Wealth'].astype('float')  # Makes Wealth a number

# We now see Wealth is a floating point value
df.dtypes

# Describe is a built in function to show some useful stats
# count - shows the number of values in the dataset
# std - standard deviation, the square root of the
#       average of the squared distance of the values from the mean
#       don't worry, we'll come to this later!
# min - the minimum value in a dataset
# max - the maximum value in a dataset
# 25% - a quarter of the values are less than this
# 50% - effectively the median. half the values
#       are less than this
# 75% - a quarter of the values are greater than this
df.describe()

# You can also call these as specific functions, e.g. median
df['Wealth'].median()

# This produces a histogram - a type of chart used to show
# the distribution of a dataset. The argument passed in sets
# the number of 'bins' used to group data.
df['Wealth'].hist(bins=30)

# Another histogram, this time on the age dataset.
df['Age'].hist(bins=10)

# A pie chart of how many people come from each country.
# Country is a text column and a pie chart needs numbers, so the
# rows per country are counted first.
df['Country'].value_counts().plot.pie()

# This uses plotly (imported at the top of the file)
# Creating a piechart, specifying the values and labels
fig = px.pie(
    df,
    values='Wealth',
    names='Country',
    title='200 Richest People: wealth by country',
)
# Shows the chart.
fig.show()

# This filters a dataset based on a particular value
# Here, we are saying we only want to see people with less
# than 11 billion
df_stem = df[df['Wealth'] < 11]
df_stem

# Creates a filter on those with over 150 billion
tops = df['Wealth'] > 150
# Creates a filter on those with less than 11 billion
bottoms = df['Wealth'] < 11
# Applies the filter to the dataframe, using an OR to say
# either of these is fine!
df_stem = df[tops | bottoms]
# Shows the resulting dataframe
df_stem

# Stem plot of the filtered wealth values, using matplotlib
# (imported at the top of the file).
# NOTE(review): the original cell left "median =" unfinished and
# hard-coded 18 as the baseline of the stems; presumably 18 is the
# approximate median wealth - confirm, or use df['Wealth'].median().
median = 18
plt.stem(df_stem['Wealth'], markerfmt='ro', bottom=median)
# NOTE(review): my_range is not used by this chart; kept in case a
# later cell relies on it.
my_range = range(1, len(df.head().index) + 1)
plt.tick_params(
    axis='x',           # changes apply to the x-axis
    which='both',       # both major and minor ticks are affected
    bottom=False,       # ticks along the bottom edge are off
    top=False,          # ticks along the top edge are off
    labelbottom=False,  # labels along the bottom edge are off
)
plt.show()

# Reading in an additional dataset that has continent data
iso_alpha_map = pd.read_csv('/work/continents.csv')
# Renaming the 'name' column, so it can be more easily
# matched to our wealth dataset.
iso_alpha_map.rename(columns={'name': 'Country'}, inplace=True)
iso_alpha_map

# Showing joining the dataframe with another, which has
# the locations of countries. merge defaults to an inner join, so
# only rows whose Country appears in both dataframes are kept.
df_with_iso = pd.merge(df, iso_alpha_map, on='Country')
df_with_iso

# Creates a geographic plot
fig = px.scatter_geo(
    df_with_iso,
    locations="alpha-3",
    color='Wealth',
    color_continuous_scale='RdBu',
    size='Wealth',  # size of markers
)
fig.show()