import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
ski_data = pd.read_csv('ski_resort_data.csv')
ski_data.info
ski_data.head()
ski_data[ski_data.Name == 'Big Mountain Resort'].T
missing = pd.concat([ski_data.isnull().sum(), 100 * ski_data.isnull().mean()], axis=1)
missing.columns=['count', '%']
missing.sort_values(by='count', ascending=False)
ski_data.select_dtypes('object')
ski_data['Name'].value_counts().head()
(ski_data['Name'] + ', ' + ski_data['Region']).value_counts().head()
(ski_data['Name'] + ', ' + ski_data['state']).value_counts().head()
ski_data[ski_data['Name'] == 'Crystal Mountain']
(ski_data.Region != ski_data.state).count()
ski_data['Region'].value_counts()
(ski_data[ski_data.Region != ski_data.state]
.groupby('state')['Region']
.value_counts())
ski_data[['Region', 'state']].nunique()
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 8))
#Specify a horizontal barplot ('barh') as kind of plot (kind=)
ski_data.Region.value_counts().plot(kind='barh', ax=ax[0])
#Give the plot a helpful title of 'Region'
ax[0].set_title('Region')
#Label the xaxis 'Count'
ax[0].set_xlabel('Count')
#Specify a horizontal barplot ('barh') as kind of plot (kind=)
ski_data.state.value_counts().plot(kind='barh', ax=ax[1])
#Give the plot a helpful title of 'state'
ax[1].set_title('state')
#Label the xaxis 'Count'
ax[1].set_xlabel('Count')
#Give the subplots a little "breathing room" with a wspace of 0.5
plt.subplots_adjust(wspace=0.5);
state_price_means = ski_data.groupby('state')[['AdultWeekday','AdultWeekend']].mean()
state_price_means.head()
(state_price_means.reindex(index=state_price_means.mean(axis=1)
.sort_values(ascending=False)
.index)
.plot(kind='barh', figsize=(10, 10), title='Average ticket price by State'))
plt.xlabel('Price ($)');
ticket_prices = pd.melt(ski_data[['state', 'AdultWeekday','AdultWeekend']],
id_vars=['state'],
var_name= 'Ticket',
value_vars=['AdultWeekday','AdultWeekend'],
value_name='Price')
ticket_prices.head()
plt.subplots(figsize=(12, 8))
sns.boxplot(x='state', y='Price', hue='Ticket', data=ticket_prices)
plt.xticks(rotation='vertical')
plt.ylabel('Price ($)')
plt.xlabel('State');
ski_data.describe().transpose()
missing_price = ski_data[['AdultWeekend', 'AdultWeekday']].isnull().sum(axis=1)
missing_price.value_counts()/len(missing_price) * 100
ski_data.hist(figsize=(15,10))
plt.subplots_adjust(hspace=0.5);
ski_data.loc[ski_data.SkiableTerrain_ac > 10000]
ski_data[ski_data. SkiableTerrain_ac > 10000].transpose()
ski_data.loc[39, 'SkiableTerrain_ac']
ski_data.loc[39, 'SkiableTerrain_ac'] = 1819
ski_data.loc[39, 'SkiableTerrain_ac']
ski_data.SkiableTerrain_ac.hist(bins=30)
plt.xlabel('SkiableTerrain_ac')
plt.ylabel('Count')
plt.title('Distribution of skiable area (acres) after replacing erroneous value');
ski_data['Snow Making_ac'][ski_data['Snow Making_ac'] > 1000]
ski_data[ski_data['Snow Making_ac'] > 3000].T
.6 * 4800
ski_data.fastEight.value_counts()
ski_data.drop(columns='fastEight', inplace=True)
ski_data.loc[ski_data.yearsOpen > 100]
ski_data['yearsOpen'].loc[ski_data.yearsOpen < 100].hist(bins=30)
plt.xlabel('Years open')
plt.ylabel('Count')
plt.title('Distribution of years open excluding 2019');
ski_data.yearsOpen[ski_data.yearsOpen < 1000].describe()
ski_data = ski_data[ski_data.yearsOpen < 1000]
state_summary = ski_data.groupby('state').agg(
resorts_per_state=pd.NamedAgg(column='Name', aggfunc='size'), #could pick any column here
state_total_skiable_area_ac=pd.NamedAgg(column='SkiableTerrain_ac', aggfunc='sum'),
state_total_days_open=pd.NamedAgg(column='daysOpenLastYear', aggfunc='sum'),
state_total_terrain_parks=pd.NamedAgg(column='TerrainParks', aggfunc='sum'),
state_total_nightskiing_ac=pd.NamedAgg(column='NightSkiing_ac', aggfunc='sum')
).reset_index()
state_summary.head()
missing_price = ski_data[['AdultWeekend', 'AdultWeekday']].isnull().sum(axis=1)
missing_price.value_counts()/len(missing_price) * 100
ski_data = ski_data[missing_price != 2]
ski_data.hist(figsize=(15, 10))
plt.subplots_adjust(hspace=0.5);
states_url = 'https://simple.wikipedia.org/w/index.php?title=List_of_U.S._states&oldid=7168473'
usa_states = pd.read_html(states_url)
type(usa_states)
len(usa_states)
usa_states = usa_states[0]
usa_states.head()
established = usa_states.iloc[:, 4]
established
usa_states_sub = usa_states.iloc[:, [0, 5, 6]].copy()
usa_states_sub.columns = ['state', 'state_population', 'state_area_sq_miles']
usa_states_sub.head()
missing_states = set(state_summary.state) - set(usa_states_sub.state)
missing_states
usa_states_sub.state[usa_states_sub.state.str.contains('Massachusetts|Pennsylvania|Rhode Island|Virginia')]
usa_states_sub.state.replace(to_replace='\[.*\]', value='', regex=True, inplace=True)
usa_states_sub.state[usa_states_sub.state.str.contains('Massachusetts|Pennsylvania|Rhode Island|Virginia')]
missing_states = set(state_summary.state) - set(usa_states_sub.state)
missing_states
state_summary = state_summary.merge(usa_states_sub, how='left', on='state')
state_summary.head()
ski_data.plot(x='AdultWeekday', y='AdultWeekend', kind='scatter');
ski_data.loc[ski_data.state == 'Montana', ['AdultWeekend', 'AdultWeekday']]
ski_data[['AdultWeekend', 'AdultWeekday']].isnull().sum()
ski_data.drop(columns='AdultWeekday', inplace=True)
ski_data.dropna(subset=['AdultWeekend'], inplace=True)
ski_data.shape
missing = pd.concat([ski_data.isnull().sum(axis=1), 100 * ski_data.isnull().mean(axis=1)], axis=1)
missing.columns=['count', '%']
missing.sort_values(by='count', ascending=False).head(10)
missing['%'].unique()
missing['%'].value_counts()
ski_data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 277 entries, 0 to 329
Data columns (total 25 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Name 277 non-null object
1 Region 277 non-null object
2 state 277 non-null object
3 summit_elev 277 non-null int64
4 vertical_drop 277 non-null int64
5 base_elev 277 non-null int64
6 trams 277 non-null int64
7 fastSixes 277 non-null int64
8 fastQuads 277 non-null int64
9 quad 277 non-null int64
10 triple 277 non-null int64
11 double 277 non-null int64
12 surface 277 non-null int64
13 total_chairs 277 non-null int64
14 Runs 274 non-null float64
15 TerrainParks 233 non-null float64
16 LongestRun_mi 272 non-null float64
17 SkiableTerrain_ac 275 non-null float64
18 Snow Making_ac 240 non-null float64
19 daysOpenLastYear 233 non-null float64
20 yearsOpen 277 non-null float64
21 averageSnowfall 268 non-null float64
22 AdultWeekend 277 non-null float64
23 projectedDaysOpen 236 non-null float64
24 NightSkiing_ac 163 non-null float64
dtypes: float64(11), int64(11), object(3)
memory usage: 56.3+ KB
ski_data.shape
# save the data to a new csv file
ski_data.to_csv('ski_data_cleaned.csv',index=False)
# save the state_summary separately.
state_summary.to_csv('state_summary.csv', index = False)