import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Cap the number of rows pandas prints when a frame or series is displayed.
pd.options.display.max_rows = 8
# Default (width, height) used for every figure created from here on.
plt.rcParams["figure.figsize"] = (9, 6)
# Six consecutive timestamps starting on 2013-01-01, used as the row index.
dates = pd.date_range(start='20130101', periods=6)
# 6x4 frame of standard-normal samples: one row per date, columns A to D.
pd.DataFrame(
    np.random.randn(6, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
This chart is empty
# Build a frame from a dict of columns, each with its own dtype.  The scalar
# entries ('A', 'B', 'F') are repeated to match the four-element columns.
pd.DataFrame(
    {
        'A': 1.0,
        'B': pd.Timestamp('20130102'),
        'C': pd.Series(1, index=list(range(4)), dtype='float32'),
        'D': np.arange(4, dtype='int32'),
        'E': pd.Categorical(["test", "train", "test", "train"]),
        'F': 'foo',
    }
)
# Temperature data set for French cities, downloaded as CSV.  The file is
# semicolon-separated and Latin-1 encoded; its first column becomes the index.
url = "https://www.fun-mooc.fr/c4x/agrocampusouest/40001S03/asset/AnaDo_JeuDonnees_TemperatFrance.csv"
french_cities = pd.read_csv(url, sep=";", encoding="latin1", index_col=0)
french_cities
french_cities.head()  # first rows
french_cities.tail()  # last rows
french_cities.index
# Give the row index a readable name.
french_cities.index.name = "City"
french_cities.head()
import locale
import calendar
# Switch to the "C" locale so the month abbreviations below are the English
# ones, whatever the machine's own locale is.
locale.setlocale(locale.LC_ALL, 'C')
# Position 0 is an empty string, so months[1] is January.
months = calendar.month_abbr
print(' '.join(months))
# Relabel the first twelve columns with the English month abbreviations,
# leaving out any label that already matches.
french_cities.rename(
    columns={
        current: abbr
        for current, abbr in zip(french_cities.columns[:12], months[1:])
        if current != abbr
    },
    inplace=True,
)
# Same treatment for the 'Moye' column, which becomes 'Mean'.
french_cities.rename(columns={'Moye': 'Mean'}, inplace=True)
french_cities
# pd.read_html requires the `lxml`, `beautifulsoup4` and `html5lib` packages.
# It returns a list of DataFrames, one per HTML table it finds.
table_list = pd.read_html("http://www.psmsl.org/data/obtaining/")
table_list
# That page holds a single table: metadata about the stations where sea
# levels are recorded.
local_sea_level_stations = table_list[0]
local_sea_level_stations
# Three ways to select the 'Lati' column as a Series.
french_cities['Lati']         # [] on a DataFrame accesses columns
french_cities.Lati            # attribute access
french_cities.loc[:, 'Lati']  # label-based: every row, one column
# .loc[row label, column label(s)]
french_cities.loc['Rennes', 'Dec']            # single cell
french_cities.loc['Rennes', ['Sep', 'Dec']]   # list of columns
french_cities.loc['Rennes', 'Sep':'Dec']      # label slice of columns
# Boolean selector with 17 entries: True at every other position among the
# first twelve, False for the remaining five.
mask = [True, False] * 6 + [False] * 5
mask
# Rebuild the boolean selector, then print the columns it keeps for every row.
mask = [True, False] * 6 + [False] * 5
print(french_cities.iloc[:, mask])
Jan Mar May Jul Sep Nov
City
Bordeaux 5.6 10.3 15.8 20.9 18.6 9.1
Brest 6.1 7.8 11.6 15.6 14.7 9.0
Clermont 2.6 7.5 13.8 19.4 16.2 6.6
Grenoble 1.5 7.7 14.5 20.1 16.7 6.5
... ... ... ... ... ... ...
Rennes 4.8 7.9 13.1 17.9 15.7 7.8
Strasbourg 0.4 5.6 14.0 19.0 15.1 4.9
Toulouse 4.7 9.2 14.9 20.9 18.3 8.6
Vichy 2.4 7.1 13.6 19.3 16.0 6.6
[15 rows x 6 columns]
# A boolean list also works with .loc: print the selected columns of one row.
print(french_cities.loc['Rennes', mask])
Jan 4.8
Mar 7.9
May 13.1
Jul 17.9
Sep 15.7
Nov 7.8
Name: Rennes, dtype: object
# Standard deviation across the first twelve columns of each row, stored as
# a new 'std' column.
french_cities["std"] = french_cities.iloc[:, :12].std(axis="columns")
french_cities
# Remove this new column again; drop returns a new frame, so re-bind the name.
french_cities = french_cities.drop(columns="std")
french_cities
# Do not chain two indexing operations to reach (or assign) a single cell:
#   french_cities['Rennes']['Sep'] = 25      # fails: [] looks up a column,
#                                            # and 'Rennes' is a row label
#   french_cities.loc['Rennes']['Sep'] = 25  # writes into an intermediate
#                                            # object, not reliably the frame
# Address the cell with one .loc[row, column] call instead; the same form
# works for assignment:
#   french_cities.loc['Rennes', 'Sep'] = 25
french_cities.loc['Rennes', 'Sep']
french_cities
# Smallest value of the 'Mean' column and largest value of the 'Ampl' column.
french_cities['Mean'].min(), french_cities['Ampl'].max()
def fahrenheit(T):
    """Convert a temperature from degrees Celsius to degrees Fahrenheit.

    ``T`` may be anything that supports ``*``, ``/`` and ``+`` with plain
    numbers; the result is ``T * 9 / 5 + 32``.
    """
    return T * 9 / 5 + 32
# The same conversion written inline as an anonymous function and mapped
# over the integers 0..19.
list(map(lambda T: T * 9 / 5 + 32, np.arange(20)))
# Call the named `fahrenheit` function directly; it is not re-bound to a
# lambda here because PEP 8 discourages assigning lambdas to names and the
# rebinding would only shadow the existing definition.
fahrenheit(-15)
# Apply the conversion to every value of the 'Mean' column.
french_cities['Mean'].apply(fahrenheit)
# sort_values returns a sorted copy; this first call only displays it.
french_cities.sort_values(by='Lati')
# Re-bind the name to keep the ordering, largest 'Lati' value first.
french_cities = french_cities.sort_values(by='Lati', ascending=False)
french_cities
# Allow longer printouts for the long series built below.
pd.options.display.max_rows = 20
# Unstack the first twelve columns into a single long object and inspect
# both its content and its type.
unstacked = french_cities.iloc[:, :12].unstack()
unstacked
type(unstacked)
# Swap rows and columns of the first twelve columns, so each city becomes a
# column that can be plotted on its own.
city_temp = french_cities.iloc[:, :12].T
city_temp.plot()
# Box plot per column, with the tick labels rotated by 90 degrees.
city_temp.boxplot(rot=90);
# Quick look at the 'Région' column:
french_cities['Région'].describe()      # summary statistics
french_cities['Région'].unique()        # distinct values
french_cities['Région'].value_counts()  # number of rows per value
# To save memory, store the 'Région' labels as a categorical column.
french_cities = french_cities.astype({"Région": "category"})
# Memory used by the index and by each column.
french_cities.memory_usage()
# Split the rows by region.  `observed` is passed explicitly because the key
# is a categorical column: recent pandas versions emit a FutureWarning when
# it is left out, and observed=False keeps the long-standing default.
fc_grouped_region = french_cities.groupby("Région", observed=False)
type(fc_grouped_region)
# Iterating over the grouped object yields (group key, sub-frame) pairs.
for group_name, subdf in fc_grouped_region:
    print(group_name)
    print(subdf)
    print("")
NE
Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec \
City
Lille 2.4 2.9 6.0 8.9 12.4 15.3 17.1 17.1 14.7 10.4 6.1 3.5
Paris 3.4 4.1 7.6 10.7 14.3 17.5 19.1 18.7 16.0 11.4 7.1 4.3
Strasbourg 0.4 1.5 5.6 9.8 14.0 17.2 19.0 18.3 15.1 9.5 4.9 1.3
Lati Long Mean Ampl Région
City
Lille 50.38 3.04 9.73 14.7 NE
Paris 48.52 2.20 11.18 15.7 NE
Strasbourg 48.35 7.45 9.72 18.6 NE
NO
Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec \
City
Brest 6.1 5.8 7.8 9.2 11.6 14.4 15.6 16.0 14.7 12.0 9.0 7.0
Rennes 4.8 5.3 7.9 10.1 13.1 16.2 17.9 17.8 15.7 11.6 7.8 5.4
Nantes 5.0 5.3 8.4 10.8 13.9 17.2 18.8 18.6 16.4 12.2 8.2 5.5
Lati Long Mean Ampl Région
City
Brest 48.24 -4.29 10.77 10.2 NO
Rennes 48.05 -1.41 11.13 13.1 NO
Nantes 47.13 -1.33 11.69 13.8 NO
SE
Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov \
City
Vichy 2.4 3.4 7.1 9.9 13.6 17.1 19.3 18.8 16.0 11.0 6.6
Clermont 2.6 3.7 7.5 10.3 13.8 17.3 19.4 19.1 16.2 11.2 6.6
Lyon 2.1 3.3 7.7 10.9 14.9 18.5 20.7 20.1 16.9 11.4 6.7
Grenoble 1.5 3.2 7.7 10.6 14.5 17.8 20.1 19.5 16.7 11.4 6.5
Nice 7.5 8.5 10.8 13.3 16.7 20.1 22.7 22.5 20.3 16.0 11.5
Montpellier 5.6 6.7 9.9 12.8 16.2 20.1 22.7 22.3 19.3 14.6 10.0
Marseille 5.5 6.6 10.0 13.0 16.8 20.8 23.3 22.8 19.9 15.0 10.2
Dec Lati Long Mean Ampl Région
City
Vichy 3.4 46.08 3.26 10.72 16.9 SE
Clermont 3.6 45.47 3.05 10.94 16.8 SE
Lyon 3.1 45.45 4.51 11.36 18.6 SE
Grenoble 2.3 45.10 5.43 10.98 18.6 SE
Nice 8.2 43.42 7.15 14.84 15.2 SE
Montpellier 6.5 43.36 3.53 13.89 17.1 SE
Marseille 6.9 43.18 5.24 14.23 17.8 SE
SO
Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec \
City
Bordeaux 5.6 6.6 10.3 12.8 15.8 19.3 20.9 21.0 18.6 13.8 9.1 6.2
Toulouse 4.7 5.6 9.2 11.6 14.9 18.7 20.9 20.9 18.3 13.3 8.6 5.5
Lati Long Mean Ampl Région
City
Bordeaux 44.50 -0.34 13.33 15.4 SO
Toulouse 43.36 1.26 12.68 16.2 SO
# Load the spreadsheet from the working directory.
df = pd.read_excel("Folds5x2_pp.xlsx")
# Pairwise correlation between the columns.
df.corr()
import seaborn as sns
# Grid of pairwise plots, one panel per pair of columns.
sns.pairplot(df)