Data Cleaning
Monthly Sea Level Data
from openpyxl import load_workbook
sealevel_wb = load_workbook(r'sea-levels-unsmoothed.xlsx')
# store each worksheet in a variable
sealevelstations_ws = sealevel_wb['Sea Level Stations']
sealevelmonthly_ws = sealevel_wb['Sea Level Monthly']
sealevelseasonal_ws = sealevel_wb['Sea Level Seasonal']
sealevelunstandardized_ws = sealevel_wb['Sea Level Unstandardized']
import pandas as pd
# use each sheet's first row as the header line via the columns parameter;
# .values returns a fresh generator on every access, so the header row
# also comes through as the first data row (dropped below)
sealevelstations_header = pd.DataFrame(sealevelstations_ws.values, columns=next(sealevelstations_ws.values))
sealevelmonthly_header = pd.DataFrame(sealevelmonthly_ws.values, columns=next(sealevelmonthly_ws.values))
sealevelseasonal_header = pd.DataFrame(sealevelseasonal_ws.values, columns=next(sealevelseasonal_ws.values))
sealevelunstandardized_header = pd.DataFrame(sealevelunstandardized_ws.values, columns=next(sealevelunstandardized_ws.values))
sealevelstations_header
# drop the repeated header row, which also came through as the first data row
sealevelstationsdf = sealevelstations_header.iloc[1: , :].reset_index(drop=True)
sealevelmonthlydf = sealevelmonthly_header.iloc[1: , :].reset_index(drop=True)
sealevelseasonaldf = sealevelseasonal_header.iloc[1: , :].reset_index(drop=True)
sealevelunstandardizeddf = sealevelunstandardized_header.iloc[1: , :].reset_index(drop=True)
sealevelstationsdf.head()
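Because ws.values is a property that returns a fresh generator on each access, the header row above was read twice, once for columns and again as the first data row, hence the drop. A one-pass sketch (same result, no duplicate row to drop; stations_onepass is a hypothetical name):
rows = sealevelstations_ws.values        # a single generator over the sheet
header = next(rows)                      # consume the header row once
stations_onepass = pd.DataFrame(rows, columns=header)  # remaining rows are data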
sealevelmonthlydf.iloc[:,0:14]
sealevelmonthlydf_copy = sealevelmonthlydf.copy()
sealevelmonthlydf_copy = sealevelmonthlydf_copy.transpose()
# lift out first row as column headers
headers = sealevelmonthlydf_copy.iloc[0]
sealevelmonthlydf_copy = sealevelmonthlydf_copy[1:]
sealevelmonthlydf_copy.columns = headers
sealevelmonthlydf_copy
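The transpose-then-lift pattern above is a general idiom; a minimal sketch of it as a reusable helper (hypothetical promote_header, not used elsewhere):
def promote_header(df):
    # return a copy whose column names come from the original first row
    out = df.iloc[1:].copy()
    out.columns = df.iloc[0]
    return out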
# also save a version with the Month column removed
sealevelmonthlydf_copy2 = sealevelmonthlydf_copy.copy()
del sealevelmonthlydf_copy2['Month']
sealevelmonthlydf_copy2.head()
sealevelmonthlydf_copy2.to_csv('sealevel_nomonths.csv')
months = {1:'January',
2:'February',
3:'March',
4:'April',
5:'May',
6:'June',
7:'July',
8:'August',
9:'September',
10:'October',
11:'November',
12:'December'}
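The same mapping can also be built from the standard library, if preferred (a sketch producing an identical dictionary):
import calendar
# calendar.month_name[1] == 'January' ... calendar.month_name[12] == 'December'
months = {i: calendar.month_name[i] for i in range(1, 13)}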
# replace numbers in the Month column with month names using the dictionary;
# assign the result back rather than calling an inplace replace on an iloc
# slice, which may not propagate to the underlying frame
sealevelmonthlydf_copy['Month'] = sealevelmonthlydf_copy['Month'].replace(months)
sealevelmonthlydf_copy
# convert index (year) to strings
sealevelmonthlydf_copy.index = sealevelmonthlydf_copy.index.astype(str)
# rename index with the original index (year) plus the value in the first column (month)
sealevelmonthlydf_copy.index = sealevelmonthlydf_copy.index + ' ' + sealevelmonthlydf_copy.iloc[:,0]
# drop Month column
del sealevelmonthlydf_copy['Month']
sealevelmonthlydf_copy
# change column data type for regions
sealevelmonthlydf_copy = sealevelmonthlydf_copy.astype(float)
sealevelmonthlydf_copy.dtypes
sealevelmonthlydf_copy
sealevelmonthlydf_copy.to_csv( 'cleaned_sealevelmonthly.csv' )
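A quick round-trip read (a sketch) confirms the saved file keeps the combined 'Year Month' index:
check = pd.read_csv('cleaned_sealevelmonthly.csv', index_col=0)
check.head()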
Shark Attack Data
shark_df = pd.read_csv('global-shark-attack.csv',
names=['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
'Activity', 'Name', 'Sex', 'Age', 'Injury', 'Fatal (Y/N)', 'Time', 'Species',
'Investigator or Source', 'pdf', 'href formula', 'href', 'original order'])
shark_df.head()
# drop the first row, which repeats the column names
shark_df = shark_df.drop(shark_df.index[0]).reset_index(drop=True)
shark_df
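Since the file's own header row had to be dropped above, skiprows=1 together with the explicit names would do the same in one read (a sketch, same column names assumed):
shark_df = pd.read_csv('global-shark-attack.csv', skiprows=1,
                       names=['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
                              'Activity', 'Name', 'Sex', 'Age', 'Injury', 'Fatal (Y/N)', 'Time', 'Species',
                              'Investigator or Source', 'pdf', 'href formula', 'href', 'original order'])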
shark_df['Year'].unique()
shark_df['Year'] = shark_df['Year'].astype(float)
shark_df = shark_df.dropna(subset=['Year']).reset_index(drop=True)
shark_df = shark_df[shark_df['Year'] >= 1961]
shark_df.head()
shark_df['Month'] = pd.DatetimeIndex(shark_df['Date']).month
shark_df['Month'] = shark_df['Month'].map(months)
shark_df = shark_df[~shark_df['Year'].isnull()]
shark_df['Year'] = shark_df['Year'].astype(int)
shark_df['Year Month'] = shark_df['Year'].astype(str) + ' ' + shark_df['Month'].astype(str)
shark_df = shark_df.reset_index(drop=True)
shark_df
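If the Date strings parsed reliably (not guaranteed in this dataset), the combined label could also come straight from the timestamps; a hedged sketch with coercion for unparseable entries:
dates = pd.to_datetime(shark_df['Date'], errors='coerce')  # unparseable -> NaT
year_month_alt = dates.dt.strftime('%Y %B')                # e.g. '1961 January'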
shark_df['Year'].unique()
# fill in missing country data
shark_df['Country'] = shark_df['Country'].fillna('NO COUNTRY REPORTED')
# assume that the country reported is accurate - remove question marks
# (regex=False treats '?' as a literal character rather than a regex token)
shark_df['Country'] = shark_df['Country'].str.replace('?', '', regex=False)
# convert all countries to uppercase letters
shark_df['Country'] = shark_df['Country'].str.upper()
shark_df
# consolidate repeated locations: some countries were reported under their
# colonial-era names, so replace those with their current names
def clean_country(wrong_country, right_country):
    shark_df['Country'] = shark_df['Country'].str.replace(wrong_country, right_country, regex=False)
clean_country( 'OKINAWA', 'JAPAN' )
clean_country('UNITED ARAB EMIRATES (UAE)', 'UNITED ARAB EMIRATES')
clean_country( 'MALDIVE ISLANDS', 'MALDIVES' )
clean_country( 'NEW BRITAIN', 'NEW GUINEA' )
clean_country( 'BRITISH NEW GUINEA', 'NEW GUINEA')
clean_country( 'TERRITORY OF PAPUA', 'NEW GUINEA' )
clean_country( 'PAPUA NEW GUINEA', 'NEW GUINEA')
clean_country( 'ST. MAARTIN', 'ST MARTIN' )
clean_country( 'ST. MARTIN', 'ST MARTIN' )
# TOBAGO is a substring of 'TRINIDAD & TOBAGO', so the first call doubles the
# prefix; the follow-up call repairs it (same substring pattern for CEYLON)
clean_country( 'TOBAGO', 'TRINIDAD & TOBAGO' )
clean_country( 'TRINIDAD & TRINIDAD & TOBAGO', 'TRINIDAD & TOBAGO' )
clean_country( 'CEYLON', 'SRI LANKA' )
clean_country( 'CEYLON (SRI LANKA)', 'SRI LANKA' )
clean_country( 'SRI LANKA (SRI LANKA)', 'SRI LANKA')
clean_country( 'COLUMBIA', 'COLOMBIA' )
clean_country( 'REUNION ISLAND', 'REUNION' )
# the raw data misspells 'ISLANDAS'; standardize both variants to the corrected spelling
clean_country( 'ANDAMAN / NICOBAR ISLANDAS', 'ANDAMAN AND NICOBAR ISLANDS' )
clean_country( 'ANDAMAN ISLANDS', 'ANDAMAN AND NICOBAR ISLANDS' )
clean_country( 'ENGLAND', 'UNITED KINGDOM' )
clean_country( 'BURMA', 'MYANMAR')
clean_country( 'COAST OF AFRICA', 'AFRICA' )
clean_country( 'CRETE', 'GREECE' )
clean_country( 'GRAND CAYMAN', 'CAYMAN ISLANDS' )
clean_country( 'BRITISH WEST INDIES', 'WEST INDIES' )
clean_country( 'SOLOMON ISLANDS / VANUATU', 'SOLOMON ISLANDS' )
clean_country( 'ST KITTS / NEVIS', 'ST KITTS AND NEVIS' )
clean_country( 'BETWEEN PORTUGAL & INDIA', 'UNSURE')
clean_country( 'EGYPT / ISRAEL', 'UNSURE')
clean_country( 'EQUATORIAL GUINEA / CAMEROON', 'UNSURE')
clean_country( 'IRAN / IRAQ', 'UNSURE')
clean_country( 'ITALY / CROATIA', 'UNSURE')
clean_country( 'RED SEA / INDIAN OCEAN', 'UNSURE')
unique_list_of_countries = list( shark_df['Country'].unique() )
sorted(unique_list_of_countries)
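Sequential substring replacement is order-sensitive, which is why the TRINIDAD & TOBAGO and SRI LANKA repair calls were needed above; Series.replace with a dict matches whole values and avoids the cascade (a sketch with a few of the mappings):
corrections = {
    'OKINAWA': 'JAPAN',
    'CEYLON': 'SRI LANKA',
    'TOBAGO': 'TRINIDAD & TOBAGO',
    # ...remaining mappings as above
}
shark_df['Country'] = shark_df['Country'].replace(corrections)  # whole-value match, no cascades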
shark_df.dtypes
import numpy as np
shark_df['Fatal==True'] = np.where( shark_df['Fatal (Y/N)'] == 'Y', 1, 0 )
del shark_df['Fatal (Y/N)']
shark_df['Fatal==True'] = shark_df['Fatal==True'].astype(bool)
shark_df.head()
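The comparison itself already yields booleans, so the same column could have been built in one step before the original column was deleted (a sketch of an alternative to the np.where step above, which assumes 'Fatal (Y/N)' still exists):
# NaN compares unequal to 'Y', so missing values become False, matching np.where
shark_df['Fatal==True'] = shark_df['Fatal (Y/N)'] == 'Y'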
# null out Type values that do not describe a confirmed shark attack
def replace_type(attack_type):
    shark_df['Type'] = shark_df['Type'].replace(attack_type, np.nan)
replace_type( 'Invalid' )
replace_type( 'Questionable' )
replace_type( 'Watercraft' )
replace_type( 'Sea Disaster' )
replace_type( 'Boat' )
replace_type( 'Unverified' )
replace_type( 'Under investigation' )
replace_type( 'Unconfirmed' )
shark_df['Type'].unique()
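The repeated calls can be collapsed into one pass with an isin mask (a sketch, same list of non-attack types):
non_attacks = ['Invalid', 'Questionable', 'Watercraft', 'Sea Disaster',
               'Boat', 'Unverified', 'Under investigation', 'Unconfirmed']
shark_df.loc[shark_df['Type'].isin(non_attacks), 'Type'] = np.nan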
# null out Sex values that are not a single reported 'M' or 'F'
def replace_sex(sex, new_sex):
    shark_df['Sex'] = shark_df['Sex'].replace(sex, new_sex)
replace_sex( 'lli', np.nan )
replace_sex( 'N', np.nan )
replace_sex( 'M x 2', np.nan )
replace_sex( '.', np.nan )
shark_df['Sex'].unique()
shark_df['Age'].unique()
def replace_age(age, new_age):
    shark_df['Age'] = shark_df['Age'].replace(age, new_age)
replace_age( 'MAKE LINE GREEN', np.nan )
replace_age( 'A.M.', np.nan )
replace_age( 'F', np.nan )
replace_age( 'X', np.nan )
replace_age( '20?', 20 )
replace_age( '6½', 6.5 )
replace_age( '2½', 2.5 )
replace_age( 'Ca. 33', 33 )
replace_age( '2 to 3 months', 0.21 )
replace_age( '9 months', 0.75 )
replace_age( '18 months', 1.5 )
# multiple people reported in a single record - replace with a null value
replace_age( '9 & 12', np.nan )
replace_age( '9 & 60', np.nan )
replace_age( 'Both 11', np.nan)
replace_age( '21 & ?', np.nan )
replace_age( '? & 14', np.nan )
replace_age( '? & 19', np.nan )
replace_age( '7 & 31', np.nan )
replace_age( '23 & 26', np.nan )
replace_age( '36 & 23', np.nan )
replace_age( '46 & 34', np.nan )
replace_age( '28, 23 & 30', np.nan )
replace_age( '45 and 15', np.nan )
replace_age( '21, 34,24 & 35', np.nan )
replace_age( '37, 67, 35, 27, ? & 27', np.nan )
replace_age( '30 & 32', np.nan )
replace_age( '36 & 26', np.nan )
replace_age( '17 & 16', np.nan )
replace_age( '50 & 30', np.nan )
replace_age( '28 & 26', np.nan )
replace_age( '34 & 19', np.nan )
replace_age( '33 & 26', np.nan )
replace_age( '17 & 35', np.nan )
replace_age( '22, 57, 31', np.nan )
replace_age( '32 & 30', np.nan )
replace_age( '33 & 37', np.nan )
replace_age( '28 & 22', np.nan )
replace_age( '23 & 20', np.nan )
# x or y - replace with average
replace_age( '7 or 8', 7.5 )
replace_age( '8 or 10', 9 )
replace_age( '9 or 10', 9.5 )
replace_age( '10 or 12', 11 )
replace_age( '12 or 13', 12.5 )
replace_age( '13 or 14', 13.5 )
replace_age( '13 or 18', 15.5 )
replace_age( '18 or 20', 19 )
replace_age( '21 or 26', 23.5 )
replace_age( '25 or 28', 26.5 )
replace_age( '30 or 36', 33 )
replace_age( '31 or 33', 32 )
replace_age( '33 or 37', 35 )
# central value for age buckets
replace_age( '16 to 18', 17 )
replace_age( '18 to 22', 20 )
replace_age( '20s', 25 )
replace_age( "20's", 25 )
replace_age( 'mid-20s', 25 )
replace_age( '25 to 35', 30 )
replace_age( '30s', 35 )
replace_age( 'mid-30s', 35)
replace_age( '40s', 45 )
replace_age( '50s', 55 )
replace_age( '>50', 60)
replace_age( '60s', 65 )
replace_age( "60's", 65 )
replace_age( '80s', 85 )
replace_age( 'young', np.nan )
replace_age( 'Elderly', np.nan )
replace_age( 'Teen', 15 )
replace_age( 'teen', 15 )
replace_age( 'Teens', 15 )
replace_age( 'middle-age', 45 )
replace_age( 'a minor', np.nan )
replace_age( 'adult', np.nan )
replace_age( '(adult)', np.nan)
shark_df['Age'] = shark_df['Age'].astype( float )
shark_df.dtypes
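An alternative to enumerating every malformed value is pd.to_numeric with coercion, which keeps clean numbers and nulls everything unparseable in one pass, at the cost of the hand-mapped bucket values above (a sketch):
ages_coerced = pd.to_numeric(shark_df['Age'], errors='coerce')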
shark_df = shark_df.drop(['Area', 'Location', 'Activity', 'Name', 'Injury', 'Time', 'Species',
'Investigator or Source', 'pdf', 'href formula', 'href', 'original order'], axis=1)
shark_df.head()
shark_df.to_csv('cleaned_sharks.csv')
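As with the sea-level data, a round-trip read (a sketch) confirms the final file loads cleanly:
check = pd.read_csv('cleaned_sharks.csv', index_col=0)
check.head()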