# imports the pandas library
import pandas as pd
# uses the pandas library to read a csv, then saves the result,
# a dataframe, to a local variable
# df = pd.read_csv('/work/richest.csv')
# causes the dataframe to be shown below the cell
# df
# Square brackets are used to show a particular column.
# Sum is a built in function used to add together all
# values in a column. However, we see a problem - as the
# wealth values are a text field, they get turned into one
# big string rather than being added together.
# df['Wealth'].sum()
# The dtypes property shows the datatypes of all the
# columns in a dataframe. Here, we have the following
# int64 - An integer (whole number) stored in 64 bits of space.
# float64 - A floating point decimal (the decimal point can
# move) stored in 64 bits of space.
# object - strings in pandas are listed as an object;
# this is because pandas stores them as general python objects
# rather than as a fixed-size numeric type
# df.dtypes
# The .str accessor lets us treat the values in Wealth as strings
# (see note above about how strings are objects)
# and then use the built-in strip function to remove
# certain characters from the start and end of each value
# df['Wealth'] = df['Wealth'].str.strip('$')
# df['Wealth'] = df['Wealth'].str.strip('B')
# df
# We can then convert the type to be float, a decimal.
# In each case, we're creating a new column to overwrite
# the old column, with some operations performed on it.
# df['Wealth'] = df['Wealth'].astype('float')
# Makes Wealth a number
# We now see Wealth is a floating point value
# df.dtypes
# Describe is a built in function to show some useful stats
# count - shows the number of (non-missing) values in the dataset
# mean - the average of the values
# std - standard deviation, roughly the square root of the
# average of the squared distance between each value and the mean
# don't worry, we'll come to this later!
# min - the minimum value in a dataset
# max - the maximum value in a dataset
# 25% - a quarter of the values are less than this
# 50% - effectively the median. half the values
# are less than this
# 75% - a quarter of the values are greater than this
# df.describe()
# You can also call these as specific functions, e.g. median
# df['Wealth'].median()
# This produces a histogram - a type of chart used to show
# the distribution of a dataset. The argument passed in sets
# the number of 'bins' used to group data.
# df['Wealth'].hist(bins=30)
# Another histogram, this time on the Age column.
# df['Age'].hist(bins=10)
# df['Country'].plot.pie()
# This uses plotly, a plotting library we will look at later
# import plotly.express as px
# Creating a piechart, specifying the values and labels
# fig = px.pie(df, values=df['Wealth'], names=df['Country'], title='200 Richest People: wealth by country')
# Shows the chart.
# fig.show()
# Importing matplotlib, a more traditional library
# from matplotlib import pyplot as plt
# This filters a dataset based on a particular value
# Here, we are saying we only want to see people with less
# than 11 billion
# df_stem = df[df['Wealth']<11]
# df_stem
# Creates a filter on those with over 150 billion
# tops = df['Wealth']>150
# Creates a filter on those with less than 11 billion
# bottoms = df['Wealth']<11
# Applies the filter to the dataframe, using an OR to say
# either of these is fine!
# df_stem = df[tops | bottoms]
# Shows the resulting dataframe
# df_stem
# Importing matplotlib, a more traditional library
# from matplotlib import pyplot as plt
# median =
# plt.stem(df_stem['Wealth'], markerfmt='ro', bottom=18)
# my_range=range(1,len(df.head().index)+1)
# plt.tick_params(axis='x',  # changes apply to the x-axis
# which='both',  # both major and minor ticks are affected
# bottom=False,  # ticks along the bottom edge are off
# top=False,  # ticks along the top edge are off
# labelbottom=False)  # labels along the bottom edge are off
# plt.show()
# Reading in an additional dataset that has continent data
# iso_alpha_map = pd.read_csv('/work/continents.csv')
# Renaming the 'name' column to 'Country', so it can be more easily
# matched to our wealth dataset.
# iso_alpha_map.rename(columns={'name': 'Country'}, inplace=True)
# iso_alpha_map
# Joining the dataframe with another one, which has
# the locations of countries, matching rows on 'Country'.
# df_with_iso = pd.merge(df, iso_alpha_map, on = 'Country')
# df_with_iso
# Creates a geographic plot
# import plotly.express as px
# fig = px.scatter_geo(df_with_iso, locations="alpha-3",
# color='Wealth',color_continuous_scale='RdBu',size='Wealth') # size of markers
# fig.show()