import pandas as pd
sheets = pd.read_excel('shipping1997.xlsx', sheet_name = None)
sheets.keys()
shipping = sheets['shipping']
states = sheets['states']
commodities = sheets['commodities']
distances = sheets['distances']
states.head()
# Converting abbreviations
convert_abb_to_name = dict( zip( states['Abbreviation'], states['State'] ) )
shipping['origin'] = shipping['origin'].map( convert_abb_to_name )
shipping['dest'] = shipping['dest'].map( convert_abb_to_name )
shipping.head()
# changing adjacent column in distances dataframe
distances['adjacent'] = distances['adjacent'].astype('bool')
distances.head()
# wanted to know the num of rows
shipping.info
# Changing abbreviations in distances dataframe to names
distances['origin'] = distances['origin'].map( convert_abb_to_name )
distances['dest'] = distances['dest'].map( convert_abb_to_name )
distances.head()
# merging dataframes
merged = pd.merge(shipping, distances, on = ['origin', 'dest'])
# checking to see if num of rows is correct
merged.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 107500 entries, 0 to 107499
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 origin 107500 non-null object
1 dest 107500 non-null object
2 SCTG Code 107500 non-null int64
3 Value($ mil) 107500 non-null object
4 Tons(000) 107500 non-null object
5 Ton-miles(mil) 107500 non-null object
6 Distance(km) 107500 non-null float64
7 adjacent 107500 non-null bool
dtypes: bool(1), float64(1), int64(1), object(5)
memory usage: 6.7+ MB
merged.head()