import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Notebook-wide display defaults: limit how many DataFrame rows are shown and
# set the default matplotlib figure size.
pd.options.display.max_rows = 8
plt.rcParams["figure.figsize"] = (9, 6)
# Six consecutive daily timestamps starting on 2013-01-01, used as row index.
dates = pd.date_range(start='20130101', periods=6)
# 6x4 frame of random normal draws, indexed by `dates`, with columns A-D.
pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
# NOTE: This chart is empty.
# The chart was probably not set up properly in the notebook.
# Build a frame from a dict of columns with mixed types: scalars ('A', 'B',
# 'F') alongside 4-element array-likes ('C', 'D', 'E').
pd.DataFrame(
    {
        'A': 1.0,
        'B': pd.Timestamp('20130102'),
        'C': pd.Series(1, index=list(range(4)), dtype='float32'),
        'D': np.arange(4, dtype='int32'),
        'E': pd.Categorical(["test", "train", "test", "train"]),
        'F': 'foo',
    }
)
# Load the French cities dataset: semicolon-separated, Latin-1 encoded, with
# the first column used as the row index.
url = "https://www.fun-mooc.fr/c4x/agrocampusouest/40001S03/asset/AnaDo_JeuDonnees_TemperatFrance.csv"
french_cities = pd.read_csv(url, sep=";", encoding="latin1", index_col=0)
french_cities
# Quick look at the first and last rows, and at the row index.
french_cities.head(5)
french_cities.tail(5)
french_cities.index
# Give the row index an explicit name.
french_cities.index.name = "City"
french_cities.head(5)
import locale
import calendar
# Use the "C" locale before reading the month abbreviations from `calendar`,
# whose values are locale-dependent.
locale.setlocale(locale.LC_ALL, 'C')
months = calendar.month_abbr
print(*months)
# Map each of the first 12 column names to the matching month abbreviation
# (months[0] is skipped), leaving out names that already match.
month_renames = {
    old: new
    for old, new in zip(french_cities.columns[:12], months[1:])
    if old != new
}
french_cities.rename(columns=month_renames, inplace=True)
# Rename the 'Moye' column to 'Mean'.
french_cities.rename(columns={'Moye': 'Mean'}, inplace=True)
french_cities
# Requires the `lxml`, `beautifulsoup4` and `html5lib` Python packages.
table_list = pd.read_html(io="http://www.psmsl.org/data/obtaining/")
table_list
# That page holds a single table: metadata about the stations where sea
# levels are recorded.
local_sea_level_stations = table_list[0]
local_sea_level_stations
# Three ways to select the 'Lati' column; DataFrame [] accesses columns (Series).
french_cities['Lati']
french_cities.Lati
french_cities.loc[:, 'Lati']
# Label-based selection with .loc[row, column(s)]: one label, a list of
# labels, and a label slice.
french_cities.loc['Rennes', "Dec"]
french_cities.loc['Rennes', ["Sep", "Dec"]]
french_cities.loc['Rennes', "Sep":"Dec"]
# 17-entry boolean mask: alternating True/False for the first 12 entries,
# then 5 x False (presumably one entry per column -- confirm against the data).
mask = [True, False] * 6 + [False] * 5
mask
print(french_cities.iloc[:, mask])
print(french_cities.loc["Rennes", mask])
# Per-city standard deviation across the first 12 columns, stored as a new column.
french_cities["std"] = french_cities.iloc[:, :12].std(axis=1)
french_cities
french_cities = french_cities.drop("std", axis=1)  # remove this new column
french_cities
# Chained indexing must not be used to assign a single cell:
#   french_cities['Rennes']['Sep'] = 25      # [] selects columns; 'Rennes' is a row label
#   french_cities.loc['Rennes']['Sep'] = 25  # may write to a temporary object, not the frame
# The right way is a single .loc call carrying both labels:
#   french_cities.loc['Rennes', 'Sep'] = 25
# The assignment is kept as a comment so the data is not modified here; only
# the lookup is performed.
french_cities.loc['Rennes', 'Sep']
french_cities
# Lowest value of the 'Mean' column and highest value of the 'Ampl' column.
french_cities['Mean'].min(), french_cities['Ampl'].max()
def fahrenheit(T):
    """Convert a temperature from degrees Celsius to degrees Fahrenheit.

    Only arithmetic operators are used, so any operand supporting ``*``,
    ``/`` and ``+`` with numbers is accepted.

    >>> fahrenheit(-15)
    5.0
    """
    return T * 9 / 5 + 32


# Element-wise conversion of 0..19 with map(), reusing the named function
# rather than repeating the formula in an anonymous lambda.
list(map(fahrenheit, np.arange(20)))
fahrenheit(-15)
def fahrenheit(T):
    """Convert a temperature from degrees Celsius to degrees Fahrenheit."""
    return T * 9 / 5 + 32


# Apply the conversion to every value of the 'Mean' column.
french_cities['Mean'].apply(fahrenheit)
# sort_values returns a new, sorted frame (ascending by default) ...
french_cities.sort_values(by='Lati')
# ... so the result is rebound here, this time sorted by descending 'Lati'.
french_cities = french_cities.sort_values(by='Lati', ascending=False)
french_cities
# Show more rows from here on.
pd.options.display.max_rows = 20
# Unstack the first 12 columns; type(unstacked) shows what kind of object
# the call returned.
unstacked = french_cities.iloc[:, :12].unstack()
unstacked
type(unstacked)
# Transpose so that the former columns become rows and each city a column,
# then draw a line plot and a box plot (x labels rotated by 90 degrees).
city_temp = french_cities.iloc[:, :12].T
city_temp.plot()
city_temp.boxplot(rot=90)
# Summary, distinct values and frequency of each value in the 'Région' column.
french_cities['Région'].describe()
french_cities['Région'].unique()
french_cities['Région'].value_counts()
# To save memory, we can convert it to a categorical column:
french_cities["Région"] = french_cities["Région"].astype("category")
french_cities.memory_usage()
# Group the rows by region; iterating the grouped object yields
# (group name, sub-frame) pairs.
fc_grouped_region = french_cities.groupby(by="Région")
type(fc_grouped_region)
for group_name, subdf in fc_grouped_region:
    # Name, then the rows of the group, then a blank separator line.
    print(group_name, subdf, "", sep="\n")
# Load the spreadsheet from the working directory and show the pairwise
# correlation of its columns.
df = pd.read_excel(io="Folds5x2_pp.xlsx")
df.corr()
# Scatter-plot matrix of the columns.
import seaborn as sns
sns.pairplot(data=df)