import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.preprocessing import normalize
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn import ensemble
import warnings
warnings.simplefilter('ignore')
# Load the raw monthly transportation statistics export.
transportation = pd.read_csv('Monthly_Transportation_Statistics (2).csv')
# Notebook-style echoes of the frame and its columns; they have no effect
# when this file is run as a plain script.
transportation
transportation.columns

# Map the verbose source headers to short snake_case names.
column_aliases = {
    'State and Local Government Construction Spending - Water Treatment Plant': 'spending_water_treatment_plant',
    'State and Local Government Construction Spending - Water Supply': 'spending_water_supply',
    'State and Local Government Construction Spending - Waste Water Treatment Plant': 'spending_waste_water_treatment_plant',
    'State and Local Government Construction Spending - Waste Water': 'spending_waste_water',
    'State and Local Government Construction Spending - Water': 'spending_water',
    'Transit Ridership - Urban Rail - Adjusted': 'ridership_urban_rail',
    'Transit Ridership - Fixed Route Bus - Adjusted': 'ridership_bus_route',
    'Transit Ridership - Other Transit Modes - Adjusted': 'ridership_other_transit_modes',
    'Transportation Employment - Water Transportation': 'employment_water',
}
renamed = transportation.rename(columns=column_aliases)

# Column groups shared by the two selections below.
spending_columns = [
    'spending_water_treatment_plant',
    'spending_water_supply',
    'spending_waste_water_treatment_plant',
    'spending_waste_water',
    'spending_water',
]
ridership_columns = [
    'ridership_urban_rail',
    'ridership_bus_route',
    'ridership_other_transit_modes',
]

# Working frame: date, spending, ridership and water employment columns.
transportation_clean = renamed[
    ['Date'] + spending_columns + ridership_columns + ['employment_water']
]
# Numeric subset used for the pairwise plot: spending plus water employment.
transportation_viz = renamed[spending_columns + ['employment_water']]
transportation_clean.head()
# CATEGORICAL VISUALIZATION #1
# Work on an explicit copy: transportation_clean comes from selecting columns
# out of another frame, and adding columns to such a selection in place is the
# pattern pandas reports as SettingWithCopyWarning.
transportation_clean = transportation_clean.copy()
# Transform dates into a usable format and derive the calendar year with the
# vectorised .dt accessor instead of a row-by-row apply.
transportation_clean['Date'] = pd.to_datetime(transportation_clean['Date'])
transportation_clean['Year'] = transportation_clean['Date'].dt.year
# Only use data post-2018 (i.e. 2019 onwards).
after_2018 = transportation_clean[transportation_clean['Year'] > 2018]

# Plot date vs. three different transportation types on one set of axes.
plt.figure(figsize=(12, 7))
for column, label in (
    ('ridership_urban_rail', "Urban Rail"),
    ('ridership_bus_route', "Bus Route"),
    ('ridership_other_transit_modes', "Other Transit Modes"),
):
    sns.lineplot(x=after_2018['Date'], y=after_2018[column], label=label)
# The three series share this axis, so give it a neutral label rather than
# leaving it named after a single column.
plt.ylabel("Ridership")
plt.title("Transit Ridership Over Time")
plt.legend()

# Noticed "Other Transit Modes" didn't look like the other two lines, so it is
# plotted by itself. Its shape turns out to be much the same - it only looked
# constant next to the others because they have different value ranges.
plt.figure(figsize=(12, 7))
sns.lineplot(
    x=after_2018['Date'],
    y=after_2018['ridership_other_transit_modes'],
    label="Other Transit Modes",
)
plt.title("Other Transit Modes Ridership Over Time")
# CATEGORICAL VISUALIZATION #2
# Bar chart of water construction spending per year, restricted to rows whose
# Year is greater than 2010.
after_2010 = transportation_clean[transportation_clean['Year'] > 2010]
plt.figure(figsize=(12, 7))
sns.barplot(data=after_2010, x="Year", y="spending_water")
plt.title("Distribution of Construction Spending on Water After 2010")
# QUANTITATIVE VISUALIZATION #1
# Credit: taken from Lab 7
# Make a pairplot illustrating the pairwise relationships between the
# spending and water-employment columns in transportation_viz.
fig = sns.pairplot(transportation_viz, plot_kws=dict(marker="o", alpha=0.5))
# Hide every panel strictly above the diagonal: triu_indices_from(..., 1)
# yields the (row, col) positions of the upper triangle, excluding the diagonal.
for i, j in zip(*np.triu_indices_from(fig.axes, 1)):
    fig.axes[i, j].set_visible(False)
plt.show()
# Notebook-style echo of the raw frame; it has no effect when this file is run
# as a plain script.
transportation
# QUANTITATIVE VISUALIZATION #2
# Side-by-side distribution plots of water construction spending (left) and
# water transportation employment (right).
# NOTE(review): distplot is deprecated in recent seaborn releases in favour of
# histplot/displot - confirm the installed version before migrating.
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 7))
for column, axis in (('spending_water', ax1), ('employment_water', ax2)):
    sns.distplot(transportation_clean[column], ax=axis)
plt.show()
# Scatter of spending against employment for the same two columns.
plt.scatter(
    transportation_clean['spending_water'],
    transportation_clean['employment_water'],
)