import pandas as pd
!pip install xlrd
!pip install openpyxl
Requirement already satisfied: xlrd in /root/venv/lib/python3.7/site-packages (2.0.1)
WARNING: You are using pip version 21.2.4; however, version 21.3.1 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
Requirement already satisfied: openpyxl in /root/venv/lib/python3.7/site-packages (3.0.9)
Requirement already satisfied: et-xmlfile in /root/venv/lib/python3.7/site-packages (from openpyxl) (1.1.0)
WARNING: You are using pip version 21.2.4; however, version 21.3.1 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
percapitaGDP = pd.read_excel(
"https://github.com/PacktPublishing/Python-Data-Cleaning-Cookbook/raw/master/Chapter01/data/GDPpercapita.xlsx",
sheet_name="OECD.Stat export",
skiprows=4,
skipfooter=1,
usecols="A,C:T"
)
percapitaGDP.head()
percapitaGDP.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 702 entries, 0 to 701
Data columns (total 19 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Year 702 non-null object
1 2001 701 non-null object
2 2002 701 non-null object
3 2003 701 non-null object
4 2004 701 non-null object
5 2005 701 non-null object
6 2006 701 non-null object
7 2007 701 non-null object
8 2008 701 non-null object
9 2009 701 non-null object
10 2010 701 non-null object
11 2011 701 non-null object
12 2012 701 non-null object
13 2013 701 non-null object
14 2014 701 non-null object
15 2015 701 non-null object
16 2016 701 non-null object
17 2017 701 non-null object
18 2018 701 non-null object
dtypes: object(19)
memory usage: 104.3+ KB
percapitaGDP.rename(columns={"Year":"metro"}, inplace=True)
percapitaGDP.metro = percapitaGDP.metro.str.strip()
percapitaGDP.dtypes
for col in percapitaGDP.columns[1:]:
percapitaGDP[col] = pd.to_numeric(percapitaGDP[col], errors='coerce')
percapitaGDP.rename(columns={col:'pcGDP'+col}, inplace=True)
percapitaGDP.dtypes
percapitaGDP.describe()
percapitaGDP.dropna(subset=percapitaGDP.columns[1:], how='all', inplace=True)
percapitaGDP.describe()
percapitaGDP.head()
percapitaGDP.metro.count() == percapitaGDP.metro.nunique()
percapitaGDP.set_index('metro', inplace=True)
percapitaGDP.head()
percapitaGDP.loc['AUS01: Greater Sydney']